diff --git a/go.mod b/go.mod index ef086cd84..051b44be0 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,7 @@ require ( github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.12.1 github.com/prometheus/common v0.32.1 - github.com/segmentio/kafka-go v0.4.35 + github.com/segmentio/kafka-go v0.4.38 github.com/sirupsen/logrus v1.8.1 github.com/spf13/cobra v1.3.0 github.com/spf13/pflag v1.0.5 @@ -62,7 +62,7 @@ require ( github.com/inconshreveable/mousetrap v1.0.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/jpillora/backoff v1.0.0 // indirect - github.com/klauspost/compress v1.15.7 // indirect + github.com/klauspost/compress v1.15.9 // indirect github.com/libp2p/go-reuseport v0.1.0 // indirect github.com/magiconair/properties v1.8.5 // indirect github.com/mailru/easyjson v0.7.6 // indirect @@ -72,7 +72,6 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect github.com/pelletier/go-toml v1.9.4 // indirect - github.com/pierrec/lz4 v2.6.1+incompatible // indirect github.com/pierrec/lz4/v4 v4.1.15 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/prometheus/client_model v0.2.0 // indirect diff --git a/go.sum b/go.sum index 9d85e65bc..546a2fb41 100644 --- a/go.sum +++ b/go.sum @@ -202,7 +202,6 @@ github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3 github.com/dustin/go-humanize v0.0.0-20171111073723-bb3d318650d4/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/eapache/go-resiliency v1.1.0/go.mod h1:kFI+JgMyC7bLPUVY133qvEBtVayf5mFgVsvEsIPBvNs= github.com/eapache/go-resiliency v1.2.0/go.mod h1:kFI+JgMyC7bLPUVY133qvEBtVayf5mFgVsvEsIPBvNs= -github.com/eapache/go-xerial-snappy v0.0.0-20180814174437-776d5712da21 h1:YEetp8/yCZMuEPMUDHG0CW/brkkEp8mzqk2+ODEitlw= github.com/eapache/go-xerial-snappy v0.0.0-20180814174437-776d5712da21/go.mod h1:+020luEh2TKB4/GOp8oxxtq0Daoen/Cii55CzbTV6DU= github.com/eapache/queue v1.1.0/go.mod h1:6eCeP0CKFpHLu8blIFXhExK/dRa7WDZfr6jVFPTqq+I= github.com/eclipse/paho.mqtt.golang v1.2.0/go.mod h1:H9keYFcgq3Qr5OUJm/JZI/i6U7joQ8SYLhZwfeOo6Ts= @@ -235,10 +234,7 @@ github.com/form3tech-oss/jwt-go v3.2.3+incompatible/go.mod h1:pbq4aXjuKjdthFRnoD github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= github.com/franela/goblin v0.0.0-20200105215937-c9ffbefa60db/go.mod h1:7dvUGVsVBjqR7JHJk0brhHOZYGmfBYOrK0ZhYMEtBr4= github.com/franela/goreq v0.0.0-20171204163338-bcd34c9993f8/go.mod h1:ZhphrRTfi2rbfLwlschooIH4+wKKDR4Pdxhh+TRoA20= -github.com/frankban/quicktest v1.10.2 h1:19ARM85nVi4xH7xPXuc5eM/udya5ieh7b/Sv+d844Tk= github.com/frankban/quicktest v1.10.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= -github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY= -github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/fsnotify/fsnotify v1.5.1 h1:mZcQUHVQUQWoPXXtuf9yuEXKudkV2sx1E06UadKWpgI= @@ -591,12 +587,9 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.4.0/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= 
github.com/klauspost/compress v1.9.5/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= -github.com/klauspost/compress v1.9.8/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= github.com/klauspost/compress v1.11.0/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs= -github.com/klauspost/compress v1.13.6 h1:P76CopJELS0TiO2mebmnzgWaajssP/EszplttgQxcgc= -github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= -github.com/klauspost/compress v1.15.7 h1:7cgTQxJCU/vy+oP/E3B9RGbQTgbiVzIJWIKOLoAsPok= -github.com/klauspost/compress v1.15.7/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU= +github.com/klauspost/compress v1.15.9 h1:wKRjX6JRtDdrE9qwa4b/Cip7ACOshUI4smpCQanqjSY= +github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU= github.com/klauspost/cpuid v0.0.0-20170728055534-ae7887de9fa5/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= github.com/klauspost/crc32 v0.0.0-20161016154125-cb6bfca970f6/go.mod h1:+ZoRqAPRLkC4NPOvfYeR5KNOrY6TD+/sAC3HXPZgDYg= github.com/klauspost/pgzip v1.0.2-0.20170402124221-0bf5dcad4ada/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= @@ -704,8 +697,6 @@ github.com/netobserv/gopipes v0.2.0 h1:CnJQq32+xNuM85eVYy/HOf+StTJdh2K6RdaEg7NAJ github.com/netobserv/gopipes v0.2.0/go.mod h1:eGoHZW1ON8Dx/zmDXUhsbVNqatPjtpdO0UZBmGZGmVI= github.com/netobserv/loki-client-go v0.0.0-20220927092034-f37122a54500 h1:RmnoJe/ci5q+QdM7upFdxiU+D8F3L3qTd5wXCwwHefw= github.com/netobserv/loki-client-go v0.0.0-20220927092034-f37122a54500/go.mod h1:LHXpc5tjKvsfZn0pwLKrvlgEhZcCaw3Di9mUEZGAI4E= -github.com/netobserv/netobserv-ebpf-agent v0.1.1-0.20220608092850-3fd4695b7cc2 h1:K7SjoqEfzpMfIjHV85Lg8UDMvZu8rPfrsgKRoo7W30o= -github.com/netobserv/netobserv-ebpf-agent v0.1.1-0.20220608092850-3fd4695b7cc2/go.mod h1:996FEHp8Xj+AKCkiN4eH3dl/yF2DzuYM0kchWZOrapM= github.com/netobserv/netobserv-ebpf-agent v0.2.2-0.20221031160841-ccb7addde22f h1:SBWcIM6Y3u9YQHyMmO2MwTa1OvzffZxhF/NR9O/DH3I= github.com/netobserv/netobserv-ebpf-agent v0.2.2-0.20221031160841-ccb7addde22f/go.mod h1:R42+rKAUVXzr0KExHMhPu1FlTg6I7lCyNnV9ZrBhNUI= github.com/netobserv/prometheus-common v0.31.2-0.20220720134304-43e74fd22881 h1:hx5bi6xBovRjmwUoVJBzhJ3EDo4K4ZUsqqKrJuQ2vMI= @@ -765,9 +756,6 @@ github.com/philhofer/fwd v1.0.0/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG github.com/pierrec/lz4 v1.0.2-0.20190131084431-473cd7ce01a1/go.mod h1:3/3N9NVKO0jef7pBehbT1qWhCMrIgbYNnFAZCqQ5LRc= github.com/pierrec/lz4 v2.0.5+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY= github.com/pierrec/lz4 v2.5.2+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY= -github.com/pierrec/lz4 v2.6.0+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY= -github.com/pierrec/lz4 v2.6.1+incompatible h1:9UY3+iC23yxF0UfGaYrGplQ+79Rg+h/q9FV9ix19jjM= -github.com/pierrec/lz4 v2.6.1+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY= github.com/pierrec/lz4/v4 v4.1.15 h1:MO0/ucJhngq7299dKLwIMtgTfbkoSPF6AoMYDd8Q4q0= github.com/pierrec/lz4/v4 v4.1.15/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pion/dtls/v2 v2.0.3/go.mod h1:TUjyL8bf8LH95h81Xj7kATmzMRt29F/4lxpIPj2Xe4Y= @@ -850,10 +838,8 @@ github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdh github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/segmentio/kafka-go v0.1.0/go.mod h1:X6itGqS9L4jDletMsxZ7Dz+JFWxM6JHfPOCvTvk+EJo= github.com/segmentio/kafka-go 
v0.2.0/go.mod h1:X6itGqS9L4jDletMsxZ7Dz+JFWxM6JHfPOCvTvk+EJo= -github.com/segmentio/kafka-go v0.4.28 h1:ATYbyenAlsoFxnV+VpIJMF87bvRuRsX7fezHNfpwkdM= -github.com/segmentio/kafka-go v0.4.28/go.mod h1:XzMcoMjSzDGHcIwpWUI7GB43iKZ2fTVmryPSGLf/MPg= -github.com/segmentio/kafka-go v0.4.35 h1:TAsQ7q1SjS39PcFvU0zDJhCuVAxHomy7xOAfbdSuhzs= -github.com/segmentio/kafka-go v0.4.35/go.mod h1:GAjxBQJdQMB5zfNA21AhpaqOB2Mu+w3De4ni3Gbm8y0= +github.com/segmentio/kafka-go v0.4.38 h1:iQdOBbUSdfuYlFpvjuALgj7N6DrdPA0HfB4AhREOdtg= +github.com/segmentio/kafka-go v0.4.38/go.mod h1:ikyuGon/60MN/vXFgykf7Zm8P5Be49gJU6vezwjnnhU= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/shurcooL/httpfs v0.0.0-20190707220628-8d4bc4ba7749/go.mod h1:ZY1cvUeJuFPAdZ/B6v7RHavJWZn2YPVFQ1OSXhCGOkg= github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= @@ -901,7 +887,6 @@ github.com/streadway/amqp v0.0.0-20190827072141-edfb9018d271/go.mod h1:AZpEONHx3 github.com/streadway/handy v0.0.0-20190108123426-d5acb3125c2a/go.mod h1:qNTQ5P5JnDBl6z3cMAg/SywNDC5ABu5ApDIw6lUbRmI= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.2.0 h1:Hbg2NidpLE8veEBkEZTL3CvlkUIVzuU9jDplZO54c48= github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= github.com/stretchr/objx v0.4.0 h1:M2gUjqZET1qApGOWNSnZ49BAIMX4F/1plDv3+l31EJ4= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= @@ -912,7 +897,6 @@ github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81P github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= @@ -933,12 +917,10 @@ github.com/vladimirvivien/gexe v0.1.1/go.mod h1:LHQL00w/7gDUKIak24n801ABp8C+ni6e github.com/vmware/go-ipfix v0.5.12 h1:mqQknlvnvDY25apPNy9c27ri3FMDFIhzvO68Kk5Qp58= github.com/vmware/go-ipfix v0.5.12/go.mod h1:yzbG1rv+yJ8GeMrRm+MDhOV3akygNZUHLhC1pDoD2AY= github.com/willf/bitset v1.1.3/go.mod h1:RjeCKbqT1RxIR/KWY6phxZiaY1IyutSBfGjNPySAYV4= -github.com/xdg/scram v0.0.0-20180814205039-7eeb5667e42c h1:u40Z8hqBAAQyv+vATcGgV0YCnDjqSL7/q/JyPhhJSPk= github.com/xdg/scram v0.0.0-20180814205039-7eeb5667e42c/go.mod h1:lB8K/P019DLNhemzwFU4jHLhdvlE6uDZjXFejJXr49I= github.com/xdg/scram v1.0.5 h1:TuS0RFmt5Is5qm9Tm2SoD89OPqe4IRiFtyFY4iwWXsw= github.com/xdg/scram v1.0.5/go.mod h1:lB8K/P019DLNhemzwFU4jHLhdvlE6uDZjXFejJXr49I= github.com/xdg/stringprep v0.0.0-20180714160509-73f8eece6fdc/go.mod h1:Jhud4/sHMO4oL310DaZAKk9ZaJ08SJfe+sJh0HrGL1Y= -github.com/xdg/stringprep v1.0.0 h1:d9X0esnoa3dFsV0FG35rAT0RIhYFlPq7MiP+DW89La0= github.com/xdg/stringprep v1.0.0/go.mod h1:Jhud4/sHMO4oL310DaZAKk9ZaJ08SJfe+sJh0HrGL1Y= github.com/xdg/stringprep v1.0.3 h1:cmL5Enob4W83ti/ZHuZLuKD/xqJfus4fVPwE+/BDm+4= github.com/xdg/stringprep v1.0.3/go.mod h1:Jhud4/sHMO4oL310DaZAKk9ZaJ08SJfe+sJh0HrGL1Y= @@ 
-994,7 +976,6 @@ golang.org/x/crypto v0.0.0-20181029021203-45a5f77698d3/go.mod h1:6SG95UA2DQfeDnf golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190320223903-b7391e95e576/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190422162423-af44ce270edf/go.mod h1:WFFai1msRO1wXaEeE5yQxYXgSfI8pQAWXbQop6sCtWE= -golang.org/x/crypto v0.0.0-20190506204251-e1dfcc566284/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190530122614-20be4c3c3ed5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= @@ -1011,7 +992,6 @@ golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a/go.mod h1:LzIPMQfyMNhhGPh golang.org/x/crypto v0.0.0-20201002170205-7f63de1d35b0/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210220033148-5ea612d1eb83/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I= golang.org/x/crypto v0.0.0-20210817164053-32db794688a5/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.0.0-20220214200702-86341886e292 h1:f+lwQ+GtmgoY+A2YaQxlSOnDjXcQ7ZRLWOHbC6HtRqE= golang.org/x/crypto v0.0.0-20220214200702-86341886e292/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d h1:sK3txAijHtOK88l68nt020reeT1ZdKLIYetKl95FzVY= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= @@ -1112,8 +1092,6 @@ golang.org/x/net v0.0.0-20210525063256-abc453219eb5/go.mod h1:9nx3DQGgdP8bBQD5qx golang.org/x/net v0.0.0-20210813160813-60bc85c4be6d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= -golang.org/x/net v0.0.0-20220412020605-290c469a71a5 h1:bRb386wvrE+oBNdF1d/Xh9mQrfQ4ecYhW5qJ5GvTGT4= -golang.org/x/net v0.0.0-20220412020605-290c469a71a5/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220706163947-c90051bbdb60 h1:8NSylCMxLW4JvserAndSgFL7aPli6A68yf0bYFTcWCM= golang.org/x/net v0.0.0-20220706163947-c90051bbdb60/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= @@ -1244,8 +1222,6 @@ golang.org/x/sys v0.0.0-20211205182925-97ca703d548d/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220209214540-3681064d5158/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220412211240-33da011f77ad h1:ntjMns5wyP/fN65tdBD4g8J5w8n015+iIIs9rtjXkY0= -golang.org/x/sys v0.0.0-20220412211240-33da011f77ad/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a h1:dGzPydgVsqGcTRVwiLJ1jVbufYwmzD3LfVPLKsKg+0k= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term 
v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= @@ -1566,7 +1542,6 @@ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo= gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/pkg/pipeline/encode/encode_prom.go b/pkg/pipeline/encode/encode_prom.go index 8e278cf08..a57f82d92 100644 --- a/pkg/pipeline/encode/encode_prom.go +++ b/pkg/pipeline/encode/encode_prom.go @@ -83,7 +83,7 @@ var ( // Encode encodes a metric before being stored func (e *EncodeProm) Encode(metricRecord config.GenericMap) { - log.Debugf("entering EncodeMetric. metricRecord = %v", metricRecord) + log.Tracef("entering EncodeMetric. metricRecord = %v", metricRecord) // Process counters for _, mInfo := range e.counters { diff --git a/pkg/pipeline/ingest/ingest_grpc.go b/pkg/pipeline/ingest/ingest_grpc.go index 283b2c19c..c6f6b7712 100644 --- a/pkg/pipeline/ingest/ingest_grpc.go +++ b/pkg/pipeline/ingest/ingest_grpc.go @@ -11,12 +11,14 @@ import ( "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/utils" "github.com/netobserv/netobserv-ebpf-agent/pkg/grpc" "github.com/netobserv/netobserv-ebpf-agent/pkg/pbflow" - log "github.com/sirupsen/logrus" + "github.com/sirupsen/logrus" grpc2 "google.golang.org/grpc" "google.golang.org/grpc/status" "google.golang.org/protobuf/proto" ) +var glog = logrus.WithField("component", "ingest.GRPCProtobuf") + const ( defaultBufferLen = 100 ) @@ -62,7 +64,7 @@ func (no *GRPCProtobuf) Ingest(out chan<- config.GenericMap) { no.collector.Close() }() for fp := range no.flowPackets { - log.Debugf("GRPCProtobuf records len = %v", len(fp.Entries)) + glog.Debugf("Ingested %v records", len(fp.Entries)) for _, entry := range fp.Entries { out <- decode.PBFlowToMap(entry) } diff --git a/pkg/pipeline/ingest/ingest_kafka.go b/pkg/pipeline/ingest/ingest_kafka.go index 6571f94ea..353112f74 100644 --- a/pkg/pipeline/ingest/ingest_kafka.go +++ b/pkg/pipeline/ingest/ingest_kafka.go @@ -27,13 +27,16 @@ import ( "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/decode" "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/utils" kafkago "github.com/segmentio/kafka-go" - log "github.com/sirupsen/logrus" + "github.com/sirupsen/logrus" "golang.org/x/net/context" ) +var klog = logrus.WithField("component", "ingest.Kafka") + type kafkaReadMessage interface { ReadMessage(ctx context.Context) (kafkago.Message, error) Config() kafkago.ReaderConfig + Stats() kafkago.ReaderStats } type ingestKafka struct { @@ -51,9 +54,11 @@ const defaultBatchReadTimeout = int64(1000) const defaultKafkaBatchMaxLength = 500 const defaultKafkaCommitInterval = 500 +const kafkaStatsPeriod = 15 * time.Second + // Ingest ingests entries from kafka topic func (k *ingestKafka) Ingest(out chan<- config.GenericMap) { - log.Debugf("entering ingestKafka.Ingest") + klog.Debugf("entering ingestKafka.Ingest") k.metrics.createOutQueueLen(out) // initialize background listener @@ -65,19 +70,27 @@ 
func (k *ingestKafka) Ingest(out chan<- config.GenericMap) { // background thread to read kafka messages; place received items into ingestKafka input channel func (k *ingestKafka) kafkaListener() { - log.Debugf("entering kafkaListener") + klog.Debugf("entering kafkaListener") + + if logrus.IsLevelEnabled(logrus.DebugLevel) { + go k.reportStats() + } go func() { for { + if k.isStopped() { + klog.Info("gracefully exiting") + return + } + klog.Trace("fetching messages from Kafka") // block until a message arrives - log.Debugf("before ReadMessage") kafkaMessage, err := k.kafkaReader.ReadMessage(context.Background()) if err != nil { - log.Errorln(err) + klog.Errorln(err) continue } - if k.canLogMessages { - log.Debugf("string(kafkaMessage) = %s\n", string(kafkaMessage.Value)) + if k.canLogMessages && logrus.IsLevelEnabled(logrus.TraceLevel) { + klog.Tracef("string(kafkaMessage) = %s\n", string(kafkaMessage.Value)) } k.metrics.flowsProcessed.Inc() messageLen := len(kafkaMessage.Value) @@ -90,6 +103,15 @@ func (k *ingestKafka) kafkaListener() { }() } +func (k *ingestKafka) isStopped() bool { + select { + case <-k.exitChan: + return true + default: + return false + } +} + func (k *ingestKafka) processRecordDelay(record config.GenericMap) { TimeFlowEndInterface, ok := record["TimeFlowEnd"] if !ok { @@ -109,7 +131,7 @@ func (k *ingestKafka) processRecord(record []byte, out chan<- config.GenericMap) // Decode batch decoded, err := k.decoder.Decode(record) if err != nil { - log.WithError(err).Warnf("ignoring flow") + klog.WithError(err).Warnf("ignoring flow") return } k.processRecordDelay(decoded) @@ -123,7 +145,7 @@ func (k *ingestKafka) processLogLines(out chan<- config.GenericMap) { for { select { case <-k.exitChan: - log.Debugf("exiting ingestKafka because of signal") + klog.Debugf("exiting ingestKafka because of signal") return case record := <-k.in: k.processRecord(record, out) @@ -131,9 +153,23 @@ func (k *ingestKafka) processLogLines(out chan<- config.GenericMap) { } } +// reportStats periodically reports kafka stats +func (k *ingestKafka) reportStats() { + ticker := time.NewTicker(kafkaStatsPeriod) + defer ticker.Stop() + for { + select { + case <-k.exitChan: + klog.Debug("gracefully exiting stats reporter") + return + case <-ticker.C: + klog.Debugf("reader stats: %#v", k.kafkaReader.Stats()) + } + } +} + // NewIngestKafka create a new ingester func NewIngestKafka(opMetrics *operational.Metrics, params config.StageParam) (Ingester, error) { - log.Debugf("entering NewIngestKafka") + klog.Debugf("entering NewIngestKafka") jsonIngestKafka := api.IngestKafka{} if params.Ingest != nil && params.Ingest.Kafka != nil { jsonIngestKafka = *params.Ingest.Kafka @@ -149,9 +185,9 @@ func NewIngestKafka(opMetrics *operational.Metrics, params config.StageParam) (I startOffset = kafkago.LastOffset default: startOffset = kafkago.FirstOffset - log.Errorf("illegal value for StartOffset: %s\n", startOffsetString) + klog.Errorf("illegal value for StartOffset: %s\n", startOffsetString) } - log.Debugf("startOffset = %v", startOffset) + klog.Debugf("startOffset = %v", startOffset) groupBalancers := make([]kafkago.GroupBalancer, 0) for _, gb := range jsonIngestKafka.GroupBalancers { switch gb { @@ -162,7 +198,7 @@ func NewIngestKafka(opMetrics *operational.Metrics, params config.StageParam) (I case "rackAffinity": groupBalancers = append(groupBalancers, &kafkago.RackAffinityGroupBalancer{}) default: - log.Warningf("groupbalancers parameter missing") + klog.Warningf("groupbalancers parameter missing") groupBalancers = 
append(groupBalancers, &kafkago.RoundRobinGroupBalancer{}) } } @@ -171,20 +207,20 @@ func NewIngestKafka(opMetrics *operational.Metrics, params config.StageParam) (I if jsonIngestKafka.BatchReadTimeout != 0 { batchReadTimeout = jsonIngestKafka.BatchReadTimeout } - log.Infof("batchReadTimeout = %d", batchReadTimeout) + klog.Infof("batchReadTimeout = %d", batchReadTimeout) commitInterval := int64(defaultKafkaCommitInterval) if jsonIngestKafka.CommitInterval != 0 { commitInterval = jsonIngestKafka.CommitInterval } - log.Infof("commitInterval = %d", jsonIngestKafka.CommitInterval) + klog.Infof("commitInterval = %d", jsonIngestKafka.CommitInterval) dialer := &kafkago.Dialer{ Timeout: kafkago.DefaultDialer.Timeout, DualStack: kafkago.DefaultDialer.DualStack, } if jsonIngestKafka.TLS != nil { - log.Infof("Using TLS configuration: %v", jsonIngestKafka.TLS) + klog.Infof("Using TLS configuration: %v", jsonIngestKafka.TLS) tlsConfig, err := jsonIngestKafka.TLS.Build() if err != nil { return nil, err @@ -203,22 +239,21 @@ func NewIngestKafka(opMetrics *operational.Metrics, params config.StageParam) (I } if jsonIngestKafka.PullQueueCapacity > 0 { - log.Infof("pullQueueCapacity = %d", jsonIngestKafka.PullQueueCapacity) readerConfig.QueueCapacity = jsonIngestKafka.PullQueueCapacity } if jsonIngestKafka.PullMaxBytes > 0 { - log.Infof("pullMaxBytes = %d", jsonIngestKafka.PullMaxBytes) readerConfig.MaxBytes = jsonIngestKafka.PullMaxBytes } + klog.Debugf("reader config: %#v", readerConfig) + kafkaReader := kafkago.NewReader(readerConfig) if kafkaReader == nil { errMsg := "NewIngestKafka: failed to create kafka-go reader" - log.Errorf("%s", errMsg) + klog.Errorf("%s", errMsg) return nil, errors.New(errMsg) } - log.Debugf("kafkaReader = %v", kafkaReader) decoder, err := decode.GetDecoder(jsonIngestKafka.Decoder) if err != nil { diff --git a/pkg/pipeline/ingest/ingest_kafka_test.go b/pkg/pipeline/ingest/ingest_kafka_test.go index 7adcddf7c..150c81c46 100644 --- a/pkg/pipeline/ingest/ingest_kafka_test.go +++ b/pkg/pipeline/ingest/ingest_kafka_test.go @@ -180,6 +180,10 @@ func (f *fakeKafkaReader) Config() kafkago.ReaderConfig { return kafkago.ReaderConfig{} } +func (f *fakeKafkaReader) Stats() kafkago.ReaderStats { + return kafkago.ReaderStats{} +} + func Test_KafkaListener(t *testing.T) { ingestOutput := make(chan config.GenericMap) newIngest := initNewIngestKafka(t, testConfig1) diff --git a/pkg/pipeline/transform/transform.go b/pkg/pipeline/transform/transform.go index 598fb83bf..5b51a41ae 100644 --- a/pkg/pipeline/transform/transform.go +++ b/pkg/pipeline/transform/transform.go @@ -20,7 +20,7 @@ package transform import ( "github.com/netobserv/flowlogs-pipeline/pkg/api" "github.com/netobserv/flowlogs-pipeline/pkg/config" - log "github.com/sirupsen/logrus" + "github.com/sirupsen/logrus" ) type Transformer interface { @@ -37,7 +37,7 @@ func (t *transformNone) Transform(f config.GenericMap) (config.GenericMap, bool) // NewTransformNone create a new transform func NewTransformNone() (Transformer, error) { - log.Debugf("entering NewTransformNone") + logrus.Debugf("entering NewTransformNone") return &transformNone{}, nil } diff --git a/pkg/pipeline/transform/transform_filter.go b/pkg/pipeline/transform/transform_filter.go index 9cca6cb18..9db5de2a0 100644 --- a/pkg/pipeline/transform/transform_filter.go +++ b/pkg/pipeline/transform/transform_filter.go @@ -20,19 +20,21 @@ package transform import ( "github.com/netobserv/flowlogs-pipeline/pkg/api" "github.com/netobserv/flowlogs-pipeline/pkg/config" - log 
"github.com/sirupsen/logrus" + "github.com/sirupsen/logrus" ) +var tlog = logrus.WithField("component", "transform.Filter") + type Filter struct { Rules []api.TransformFilterRule } // Transform transforms a flow func (f *Filter) Transform(entry config.GenericMap) (config.GenericMap, bool) { - log.Tracef("f = %v", f) + tlog.Tracef("f = %v", f) outputEntry := entry.Copy() for _, rule := range f.Rules { - log.Debugf("rule = %v", rule) + tlog.Tracef("rule = %v", rule) switch rule.Type { case api.TransformFilterOperationName("RemoveField"): delete(outputEntry, rule.Input) @@ -57,7 +59,7 @@ func (f *Filter) Transform(entry config.GenericMap) (config.GenericMap, bool) { } } default: - log.Panicf("unknown type %s for transform.Filter rule: %v", rule.Type, rule) + tlog.Panicf("unknown type %s for transform.Filter rule: %v", rule.Type, rule) } } return outputEntry, true @@ -65,7 +67,7 @@ func (f *Filter) Transform(entry config.GenericMap) (config.GenericMap, bool) { // NewTransformFilter create a new filter transform func NewTransformFilter(params config.StageParam) (Transformer, error) { - log.Debugf("entering NewTransformFilter") + tlog.Debugf("entering NewTransformFilter") rules := []api.TransformFilterRule{} if params.Transform != nil && params.Transform.Filter != nil { rules = params.Transform.Filter.Rules diff --git a/pkg/pipeline/transform/transform_generic.go b/pkg/pipeline/transform/transform_generic.go index 9b9a84bcf..57765abb9 100644 --- a/pkg/pipeline/transform/transform_generic.go +++ b/pkg/pipeline/transform/transform_generic.go @@ -20,9 +20,11 @@ package transform import ( "github.com/netobserv/flowlogs-pipeline/pkg/api" "github.com/netobserv/flowlogs-pipeline/pkg/config" - log "github.com/sirupsen/logrus" + "github.com/sirupsen/logrus" ) +var glog = logrus.WithField("component", "transform.Generic") + type Generic struct { policy string rules []api.GenericTransformRule @@ -30,45 +32,41 @@ type Generic struct { // Transform transforms a flow to a new set of keys func (g *Generic) Transform(entry config.GenericMap) (config.GenericMap, bool) { - log.Tracef("entering Generic Transform g = %v", g) - log.Tracef("entry.GenericMap = %v", entry) - outputEntry := make(config.GenericMap) + var outputEntry config.GenericMap + log.Tracef("Transform input = %v", entry) if g.policy != "replace_keys" { - // copy old map to new map - for key, value := range entry { - outputEntry[key] = value - } + outputEntry = entry.Copy() + } else { + outputEntry = config.GenericMap{} } - log.Debugf("outputEntry = %v", outputEntry) for _, transformRule := range g.rules { - log.Debugf("transformRule = %v", transformRule) outputEntry[transformRule.Output] = entry[transformRule.Input] } - log.Debugf("Transform.GenericMap = %v", outputEntry) + glog.Tracef("Transform output = %v", outputEntry) return outputEntry, true } // NewTransformGeneric create a new transform func NewTransformGeneric(params config.StageParam) (Transformer, error) { - log.Debugf("entering NewTransformGeneric") + glog.Debugf("entering NewTransformGeneric") genConfig := api.TransformGeneric{} if params.Transform != nil && params.Transform.Generic != nil { genConfig = *params.Transform.Generic } - log.Debugf("params.Transform.Generic = %v", genConfig) + glog.Debugf("params.Transform.Generic = %v", genConfig) rules := genConfig.Rules policy := genConfig.Policy switch policy { case "replace_keys", "preserve_original_keys", "": // valid; nothing to do - log.Infof("NewTransformGeneric, policy = %s", policy) + glog.Infof("NewTransformGeneric, policy = 
%s", policy) default: - log.Panicf("unknown policy %s for transform.generic", policy) + glog.Panicf("unknown policy %s for transform.generic", policy) } transformGeneric := &Generic{ policy: policy, rules: rules, } - log.Debugf("transformGeneric = %v", transformGeneric) + glog.Debugf("transformGeneric = %v", transformGeneric) return transformGeneric, nil } diff --git a/pkg/pipeline/transform/transform_network.go b/pkg/pipeline/transform/transform_network.go index 06fe92723..bc78e2831 100644 --- a/pkg/pipeline/transform/transform_network.go +++ b/pkg/pipeline/transform/transform_network.go @@ -29,10 +29,12 @@ import ( "github.com/netobserv/flowlogs-pipeline/pkg/config" "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/kubernetes" "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/location" - netdb "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/netdb" - log "github.com/sirupsen/logrus" + "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/netdb" + "github.com/sirupsen/logrus" ) +var log = logrus.WithField("component", "transform.Network") + type Network struct { api.TransformNetwork svcNames *netdb.ServiceNames @@ -116,7 +118,7 @@ func (n *Network) Transform(inputEntry config.GenericMap) (config.GenericMap, bo case api.OpAddKubernetes: kubeInfo, err := kubernetes.Data.GetInfo(fmt.Sprintf("%s", outputEntry[rule.Input])) if err != nil { - log.Debugf("Can't find kubernetes info for IP %v err %v", outputEntry[rule.Input], err) + logrus.WithError(err).Tracef("can't find kubernetes info for IP %v", outputEntry[rule.Input]) continue } // NETOBSERV-666: avoid putting empty namespaces or Loki aggregation queries will diff --git a/pkg/pipeline/utils/exit.go b/pkg/pipeline/utils/exit.go index 90c2ef66c..b2c1eb3d7 100644 --- a/pkg/pipeline/utils/exit.go +++ b/pkg/pipeline/utils/exit.go @@ -22,7 +22,7 @@ import ( "os/signal" "syscall" - log "github.com/sirupsen/logrus" + "github.com/sirupsen/logrus" ) var ( @@ -34,18 +34,18 @@ func ExitChannel() <-chan struct{} { } func SetupElegantExit() { - log.Debugf("entering SetupElegantExit") + logrus.Debugf("entering SetupElegantExit") // handle elegant exit; create support for channels of go routines that want to exit cleanly exitChannel = make(chan struct{}) exitSigChan := make(chan os.Signal, 1) - log.Debugf("registered exit signal channel") + logrus.Debugf("registered exit signal channel") signal.Notify(exitSigChan, syscall.SIGINT, syscall.SIGTERM) go func() { // wait for exit signal; then stop all the other go functions sig := <-exitSigChan - log.Debugf("received exit signal = %v", sig) + logrus.Debugf("received exit signal = %v", sig) close(exitChannel) - log.Debugf("exiting SetupElegantExit go function") + logrus.Debugf("exiting SetupElegantExit go function") }() - log.Debugf("exiting SetupElegantExit") + logrus.Debugf("exiting SetupElegantExit") } diff --git a/pkg/pipeline/utils/timed_cache.go b/pkg/pipeline/utils/timed_cache.go index f98e81661..034acac93 100644 --- a/pkg/pipeline/utils/timed_cache.go +++ b/pkg/pipeline/utils/timed_cache.go @@ -22,9 +22,11 @@ import ( "sync" "time" - log "github.com/sirupsen/logrus" + "github.com/sirupsen/logrus" ) +var log = logrus.WithField("component", "utils.TimedCache") + // Functions to manage an LRU cache with an expiry // When an item expires, allow a callback to allow the specific implementation to perform its particular cleanup @@ -58,6 +60,8 @@ func (tc *TimedCache) GetCacheEntry(key string) (interface{}, bool) { } } +var uclog = log.WithField("method", 
"UpdateCacheEntry") + func (tc *TimedCache) UpdateCacheEntry(key string, entry interface{}) *cacheEntry { nowInSecs := time.Now().Unix() tc.mu.Lock() @@ -75,11 +79,10 @@ func (tc *TimedCache) UpdateCacheEntry(key string, entry interface{}) *cacheEntr key: key, SourceEntry: entry, } + uclog.Debugf("adding entry: %#v", cEntry) // place at end of list - log.Debugf("adding entry = %v", cEntry) cEntry.e = tc.cacheList.PushBack(cEntry) tc.cacheMap[key] = cEntry - log.Debugf("cacheList = %v", tc.cacheList) } return cEntry } @@ -103,13 +106,18 @@ func (tc *TimedCache) Iterate(f func(key string, value interface{})) { // CleanupExpiredEntries removes items from cache that were last touched more than expiryTime seconds ago func (tc *TimedCache) CleanupExpiredEntries(expiryTime int64, callback CacheCallback) { - log.Debugf("entering cleanupExpiredEntries") tc.mu.Lock() defer tc.mu.Unlock() - log.Debugf("cache = %v", tc.cacheMap) - log.Debugf("list = %v", tc.cacheList) + + clog := log.WithFields(logrus.Fields{ + "mapLen": len(tc.cacheMap), + "listLen": tc.cacheList.Len(), + }) + clog.Debugf("cleaning up expried entries") + nowInSecs := time.Now().Unix() expireTime := nowInSecs - expiryTime + deleted := 0 // go through the list until we reach recently used entries for { listEntry := tc.cacheList.Front() @@ -117,12 +125,12 @@ func (tc *TimedCache) CleanupExpiredEntries(expiryTime int64, callback CacheCall return } pCacheInfo := listEntry.Value.(*cacheEntry) - log.Debugf("lastUpdatedTime = %d, expireTime = %d", pCacheInfo.lastUpdatedTime, expireTime) - log.Debugf("pCacheInfo = %v", pCacheInfo) if pCacheInfo.lastUpdatedTime > expireTime { // no more expired items + clog.Debugf("deleted %d expired entries", deleted) return } + deleted++ callback.Cleanup(pCacheInfo.SourceEntry) delete(tc.cacheMap, pCacheInfo.key) tc.cacheList.Remove(listEntry) diff --git a/pkg/pipeline/write/write.go b/pkg/pipeline/write/write.go index 46c2db27f..da05d2207 100644 --- a/pkg/pipeline/write/write.go +++ b/pkg/pipeline/write/write.go @@ -21,7 +21,7 @@ import ( "sync" "github.com/netobserv/flowlogs-pipeline/pkg/config" - log "github.com/sirupsen/logrus" + "github.com/sirupsen/logrus" ) type Writer interface { @@ -35,7 +35,7 @@ type WriteNone struct { // Write writes entries func (t *WriteNone) Write(in config.GenericMap) { - log.Debugf("entering Write none, in = %v", in) + logrus.Debugf("entering Write none, in = %v", in) t.mt.Lock() t.prevRecords = append(t.prevRecords, in) t.mt.Unlock() diff --git a/pkg/pipeline/write/write_fake.go b/pkg/pipeline/write/write_fake.go index 3c4080352..518573a1c 100644 --- a/pkg/pipeline/write/write_fake.go +++ b/pkg/pipeline/write/write_fake.go @@ -21,7 +21,7 @@ import ( "sync" "github.com/netobserv/flowlogs-pipeline/pkg/config" - log "github.com/sirupsen/logrus" + "github.com/sirupsen/logrus" ) type WriteFake struct { @@ -32,7 +32,7 @@ type WriteFake struct { // Write stores in memory all records. func (w *WriteFake) Write(in config.GenericMap) { - log.Trace("entering writeFake Write") + logrus.Trace("entering writeFake Write") w.mt.Lock() w.allRecords = append(w.allRecords, in.Copy()) w.mt.Unlock() @@ -50,7 +50,7 @@ func (w *WriteFake) AllRecords() []config.GenericMap { // NewWriteFake creates a new write. 
func NewWriteFake(_ config.StageParam) (Writer, error) { - log.Debugf("entering NewWriteFake") + logrus.Debugf("entering NewWriteFake") w := &WriteFake{} return w, nil } diff --git a/pkg/pipeline/write/write_loki.go b/pkg/pipeline/write/write_loki.go index 9a53bf11a..fcaaa5fe5 100644 --- a/pkg/pipeline/write/write_loki.go +++ b/pkg/pipeline/write/write_loki.go @@ -34,7 +34,7 @@ import ( "github.com/netobserv/loki-client-go/pkg/backoff" "github.com/netobserv/loki-client-go/pkg/urlutil" "github.com/prometheus/common/model" - log "github.com/sirupsen/logrus" + "github.com/sirupsen/logrus" ) var jsonEncodingConfig = jsonIter.Config{}.Froze() @@ -43,6 +43,8 @@ var ( keyReplacer = strings.NewReplacer("/", "_", ".", "_", "-", "_") ) +var log = logrus.WithField("component", "write.Loki") + type emitter interface { Handle(labels model.LabelSet, timestamp time.Time, record string) error } @@ -172,7 +174,7 @@ func (l *Loki) addLabels(record config.GenericMap, labels model.LabelSet) { } lv := model.LabelValue(fmt.Sprint(val)) if !lv.IsValid() { - log.WithFields(log.Fields{"key": label, "value": val}). + log.WithFields(logrus.Fields{"key": label, "value": val}). Debug("Invalid label value. Ignoring it") continue } @@ -204,10 +206,10 @@ func getFloat64(timestamp interface{}) (ft float64, ok bool) { // Write writes a flow before being stored func (l *Loki) Write(entry config.GenericMap) { - log.Debugf("entering Loki Write") + log.Tracef("writing entry: %#v", entry) err := l.ProcessRecord(entry) if err != nil { - log.Errorf("Write (Loki) error %v", err) + log.WithError(err).Warn("can't write into loki") } } @@ -246,7 +248,7 @@ func NewWriteLoki(opMetrics *operational.Metrics, params config.StageParam) (*Lo if sanitized.IsValid() { saneLabels[label] = sanitized } else { - log.WithFields(log.Fields{"key": label, "sanitized": sanitized}). + log.WithFields(logrus.Fields{"key": label, "sanitized": sanitized}). Debug("Invalid label. 
Ignoring it") } } diff --git a/pkg/pipeline/write/write_loki_test.go b/pkg/pipeline/write/write_loki_test.go index 19dda1754..b1623f750 100644 --- a/pkg/pipeline/write/write_loki_test.go +++ b/pkg/pipeline/write/write_loki_test.go @@ -31,11 +31,10 @@ import ( "github.com/netobserv/flowlogs-pipeline/pkg/operational" "github.com/netobserv/flowlogs-pipeline/pkg/test" "github.com/prometheus/common/model" + "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" - - log "github.com/sirupsen/logrus" ) const timeout = 5 * time.Second @@ -325,7 +324,7 @@ func hundredFlows() []config.GenericMap { } func BenchmarkWriteLoki(b *testing.B) { - log.SetLevel(log.ErrorLevel) + logrus.SetLevel(logrus.ErrorLevel) lokiFlows := make(chan map[string]interface{}, 256) fakeLoki := httptest.NewServer(test.FakeLokiHandler(lokiFlows)) diff --git a/pkg/pipeline/write/write_stdout.go b/pkg/pipeline/write/write_stdout.go index f37086e6e..5c228305a 100644 --- a/pkg/pipeline/write/write_stdout.go +++ b/pkg/pipeline/write/write_stdout.go @@ -26,7 +26,7 @@ import ( "time" "github.com/netobserv/flowlogs-pipeline/pkg/config" - log "github.com/sirupsen/logrus" + "github.com/sirupsen/logrus" ) type writeStdout struct { @@ -35,7 +35,7 @@ type writeStdout struct { // Write writes a flow before being stored func (t *writeStdout) Write(v config.GenericMap) { - log.Tracef("entering writeStdout Write") + logrus.Tracef("entering writeStdout Write") if t.format == "json" { txt, _ := json.Marshal(v) fmt.Println(string(txt)) @@ -58,7 +58,7 @@ func (t *writeStdout) Write(v config.GenericMap) { // NewWriteStdout create a new write func NewWriteStdout(params config.StageParam) (Writer, error) { - log.Debugf("entering NewWriteStdout") + logrus.Debugf("entering NewWriteStdout") writeStdout := &writeStdout{} if params.Write != nil && params.Write.Stdout != nil { writeStdout.format = params.Write.Stdout.Format diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md index 1f6b7e9a2..ad5c63a82 100644 --- a/vendor/github.com/klauspost/compress/README.md +++ b/vendor/github.com/klauspost/compress/README.md @@ -17,6 +17,25 @@ This package provides various compression algorithms. 
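
Not part of the diff: a minimal, runnable sketch of the component-scoped logger pattern the pipeline files above converge on. Each file declares a package-level *logrus.Entry built with logrus.WithField("component", ...), and expensive per-record formatting is guarded behind an explicit level check, as the Kafka ingester does for message dumps. The component names mirror ones added in the diff; the error and payload values are placeholders.

```go
package main

import (
	"errors"

	"github.com/sirupsen/logrus"
)

// Component-scoped loggers, mirroring the ones added in the diff
// (e.g. ingest_kafka.go and write_loki.go). Every line they emit carries
// a "component" field, which is the point of the refactor.
var (
	klog = logrus.WithField("component", "ingest.Kafka")
	wlog = logrus.WithField("component", "write.Loki")
)

func main() {
	logrus.SetLevel(logrus.TraceLevel)

	klog.Debug("entering ingestKafka.Ingest")
	wlog.WithError(errors.New("connection refused")).Warn("can't write into loki")

	// Guard per-record payload formatting behind a level check, as the PR
	// does for Kafka message dumps, so the cost is only paid when trace
	// logging is actually enabled.
	payload := []byte(`{"SrcAddr":"10.0.0.1"}`)
	if logrus.IsLevelEnabled(logrus.TraceLevel) {
		klog.Tracef("string(kafkaMessage) = %s", string(payload))
	}
}
```
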
# changelog +* July 13, 2022 (v1.15.8) + + * gzip: fix stack exhaustion bug in Reader.Read https://github.com/klauspost/compress/pull/641 + * s2: Add Index header trim/restore https://github.com/klauspost/compress/pull/638 + * zstd: Optimize seqdeq amd64 asm by @greatroar in https://github.com/klauspost/compress/pull/636 + * zstd: Improve decoder memcopy https://github.com/klauspost/compress/pull/637 + * huff0: Pass a single bitReader pointer to asm by @greatroar in https://github.com/klauspost/compress/pull/634 + * zstd: Branchless getBits for amd64 w/o BMI2 by @greatroar in https://github.com/klauspost/compress/pull/640 + * gzhttp: Remove header before writing https://github.com/klauspost/compress/pull/639 + +* June 29, 2022 (v1.15.7) + + * s2: Fix absolute forward seeks https://github.com/klauspost/compress/pull/633 + * zip: Merge upstream https://github.com/klauspost/compress/pull/631 + * zip: Re-add zip64 fix https://github.com/klauspost/compress/pull/624 + * zstd: translate fseDecoder.buildDtable into asm by @WojciechMula in https://github.com/klauspost/compress/pull/598 + * flate: Faster histograms https://github.com/klauspost/compress/pull/620 + * deflate: Use compound hcode https://github.com/klauspost/compress/pull/622 + * June 3, 2022 (v1.15.6) * s2: Improve coding for long, close matches https://github.com/klauspost/compress/pull/613 * s2c: Add Snappy/S2 stream recompression https://github.com/klauspost/compress/pull/611 diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go new file mode 100644 index 000000000..f8435998e --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/deflate.go @@ -0,0 +1,903 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Copyright (c) 2015 Klaus Post +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +import ( + "encoding/binary" + "fmt" + "io" + "math" +) + +const ( + NoCompression = 0 + BestSpeed = 1 + BestCompression = 9 + DefaultCompression = -1 + + // HuffmanOnly disables Lempel-Ziv match searching and only performs Huffman + // entropy encoding. This mode is useful in compressing data that has + // already been compressed with an LZ style algorithm (e.g. Snappy or LZ4) + // that lacks an entropy encoder. Compression gains are achieved when + // certain bytes in the input stream occur more frequently than others. + // + // Note that HuffmanOnly produces a compressed output that is + // RFC 1951 compliant. That is, any valid DEFLATE decompressor will + // continue to be able to decompress this output. + HuffmanOnly = -2 + ConstantCompression = HuffmanOnly // compatibility alias. + + logWindowSize = 15 + windowSize = 1 << logWindowSize + windowMask = windowSize - 1 + logMaxOffsetSize = 15 // Standard DEFLATE + minMatchLength = 4 // The smallest match that the compressor looks for + maxMatchLength = 258 // The longest match for the compressor + minOffsetSize = 1 // The shortest offset that makes any sense + + // The maximum number of tokens we will encode at the time. + // Smaller sizes usually creates less optimal blocks. + // Bigger can make context switching slow. + // We use this for levels 7-9, so we make it big. 
+ maxFlateBlockTokens = 1 << 15 + maxStoreBlockSize = 65535 + hashBits = 17 // After 17 performance degrades + hashSize = 1 << hashBits + hashMask = (1 << hashBits) - 1 + hashShift = (hashBits + minMatchLength - 1) / minMatchLength + maxHashOffset = 1 << 28 + + skipNever = math.MaxInt32 + + debugDeflate = false +) + +type compressionLevel struct { + good, lazy, nice, chain, fastSkipHashing, level int +} + +// Compression levels have been rebalanced from zlib deflate defaults +// to give a bigger spread in speed and compression. +// See https://blog.klauspost.com/rebalancing-deflate-compression-levels/ +var levels = []compressionLevel{ + {}, // 0 + // Level 1-6 uses specialized algorithm - values not used + {0, 0, 0, 0, 0, 1}, + {0, 0, 0, 0, 0, 2}, + {0, 0, 0, 0, 0, 3}, + {0, 0, 0, 0, 0, 4}, + {0, 0, 0, 0, 0, 5}, + {0, 0, 0, 0, 0, 6}, + // Levels 7-9 use increasingly more lazy matching + // and increasingly stringent conditions for "good enough". + {8, 12, 16, 24, skipNever, 7}, + {16, 30, 40, 64, skipNever, 8}, + {32, 258, 258, 1024, skipNever, 9}, +} + +// advancedState contains state for the advanced levels, with bigger hash tables, etc. +type advancedState struct { + // deflate state + length int + offset int + maxInsertIndex int + chainHead int + hashOffset int + + ii uint16 // position of last match, intended to overflow to reset. + + // input window: unprocessed data is window[index:windowEnd] + index int + estBitsPerByte int + hashMatch [maxMatchLength + minMatchLength]uint32 + + // Input hash chains + // hashHead[hashValue] contains the largest inputIndex with the specified hash value + // If hashHead[hashValue] is within the current window, then + // hashPrev[hashHead[hashValue] & windowMask] contains the previous index + // with the same hash value. + hashHead [hashSize]uint32 + hashPrev [windowSize]uint32 +} + +type compressor struct { + compressionLevel + + h *huffmanEncoder + w *huffmanBitWriter + + // compression algorithm + fill func(*compressor, []byte) int // copy data to window + step func(*compressor) // process window + + window []byte + windowEnd int + blockStart int // window index where current tokens start + err error + + // queued output tokens + tokens tokens + fast fastEnc + state *advancedState + + sync bool // requesting flush + byteAvailable bool // if true, still need to process window[index-1]. +} + +func (d *compressor) fillDeflate(b []byte) int { + s := d.state + if s.index >= 2*windowSize-(minMatchLength+maxMatchLength) { + // shift the window by windowSize + copy(d.window[:], d.window[windowSize:2*windowSize]) + s.index -= windowSize + d.windowEnd -= windowSize + if d.blockStart >= windowSize { + d.blockStart -= windowSize + } else { + d.blockStart = math.MaxInt32 + } + s.hashOffset += windowSize + if s.hashOffset > maxHashOffset { + delta := s.hashOffset - 1 + s.hashOffset -= delta + s.chainHead -= delta + // Iterate over slices instead of arrays to avoid copying + // the entire table onto the stack (Issue #18625). 
+ for i, v := range s.hashPrev[:] { + if int(v) > delta { + s.hashPrev[i] = uint32(int(v) - delta) + } else { + s.hashPrev[i] = 0 + } + } + for i, v := range s.hashHead[:] { + if int(v) > delta { + s.hashHead[i] = uint32(int(v) - delta) + } else { + s.hashHead[i] = 0 + } + } + } + } + n := copy(d.window[d.windowEnd:], b) + d.windowEnd += n + return n +} + +func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error { + if index > 0 || eof { + var window []byte + if d.blockStart <= index { + window = d.window[d.blockStart:index] + } + d.blockStart = index + //d.w.writeBlock(tok, eof, window) + d.w.writeBlockDynamic(tok, eof, window, d.sync) + return d.w.err + } + return nil +} + +// writeBlockSkip writes the current block and uses the number of tokens +// to determine if the block should be stored on no matches, or +// only huffman encoded. +func (d *compressor) writeBlockSkip(tok *tokens, index int, eof bool) error { + if index > 0 || eof { + if d.blockStart <= index { + window := d.window[d.blockStart:index] + // If we removed less than a 64th of all literals + // we huffman compress the block. + if int(tok.n) > len(window)-int(tok.n>>6) { + d.w.writeBlockHuff(eof, window, d.sync) + } else { + // Write a dynamic huffman block. + d.w.writeBlockDynamic(tok, eof, window, d.sync) + } + } else { + d.w.writeBlock(tok, eof, nil) + } + d.blockStart = index + return d.w.err + } + return nil +} + +// fillWindow will fill the current window with the supplied +// dictionary and calculate all hashes. +// This is much faster than doing a full encode. +// Should only be used after a start/reset. +func (d *compressor) fillWindow(b []byte) { + // Do not fill window if we are in store-only or huffman mode. + if d.level <= 0 { + return + } + if d.fast != nil { + // encode the last data, but discard the result + if len(b) > maxMatchOffset { + b = b[len(b)-maxMatchOffset:] + } + d.fast.Encode(&d.tokens, b) + d.tokens.Reset() + return + } + s := d.state + // If we are given too much, cut it. + if len(b) > windowSize { + b = b[len(b)-windowSize:] + } + // Add all to window. + n := copy(d.window[d.windowEnd:], b) + + // Calculate 256 hashes at the time (more L1 cache hits) + loops := (n + 256 - minMatchLength) / 256 + for j := 0; j < loops; j++ { + startindex := j * 256 + end := startindex + 256 + minMatchLength - 1 + if end > n { + end = n + } + tocheck := d.window[startindex:end] + dstSize := len(tocheck) - minMatchLength + 1 + + if dstSize <= 0 { + continue + } + + dst := s.hashMatch[:dstSize] + bulkHash4(tocheck, dst) + var newH uint32 + for i, val := range dst { + di := i + startindex + newH = val & hashMask + // Get previous value with the same hash. + // Our chain should point to the previous value. + s.hashPrev[di&windowMask] = s.hashHead[newH] + // Set the head of the hash chain to us. + s.hashHead[newH] = uint32(di + s.hashOffset) + } + } + // Update window information. + d.windowEnd += n + s.index = n +} + +// Try to find a match starting at index whose length is greater than prevSize. +// We only look at chainCount possibilities before giving up. 
+// pos = s.index, prevHead = s.chainHead-s.hashOffset, prevLength=minMatchLength-1, lookahead +func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, offset int, ok bool) { + minMatchLook := maxMatchLength + if lookahead < minMatchLook { + minMatchLook = lookahead + } + + win := d.window[0 : pos+minMatchLook] + + // We quit when we get a match that's at least nice long + nice := len(win) - pos + if d.nice < nice { + nice = d.nice + } + + // If we've got a match that's good enough, only look in 1/4 the chain. + tries := d.chain + length = minMatchLength - 1 + + wEnd := win[pos+length] + wPos := win[pos:] + minIndex := pos - windowSize + if minIndex < 0 { + minIndex = 0 + } + offset = 0 + + cGain := 0 + if d.chain < 100 { + for i := prevHead; tries > 0; tries-- { + if wEnd == win[i+length] { + n := matchLen(win[i:i+minMatchLook], wPos) + if n > length { + length = n + offset = pos - i + ok = true + if n >= nice { + // The match is good enough that we don't try to find a better one. + break + } + wEnd = win[pos+n] + } + } + if i <= minIndex { + // hashPrev[i & windowMask] has already been overwritten, so stop now. + break + } + i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset + if i < minIndex { + break + } + } + return + } + + // Some like it higher (CSV), some like it lower (JSON) + const baseCost = 6 + // Base is 4 bytes at with an additional cost. + // Matches must be better than this. + for i := prevHead; tries > 0; tries-- { + if wEnd == win[i+length] { + n := matchLen(win[i:i+minMatchLook], wPos) + if n > length { + // Calculate gain. Estimate + newGain := d.h.bitLengthRaw(wPos[:n]) - int(offsetExtraBits[offsetCode(uint32(pos-i))]) - baseCost - int(lengthExtraBits[lengthCodes[(n-3)&255]]) + + //fmt.Println(n, "gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n])) + if newGain > cGain { + length = n + offset = pos - i + cGain = newGain + ok = true + if n >= nice { + // The match is good enough that we don't try to find a better one. + break + } + wEnd = win[pos+n] + } + } + } + if i <= minIndex { + // hashPrev[i & windowMask] has already been overwritten, so stop now. + break + } + i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset + if i < minIndex { + break + } + } + return +} + +func (d *compressor) writeStoredBlock(buf []byte) error { + if d.w.writeStoredHeader(len(buf), false); d.w.err != nil { + return d.w.err + } + d.w.writeBytes(buf) + return d.w.err +} + +// hash4 returns a hash representation of the first 4 bytes +// of the supplied slice. +// The caller must ensure that len(b) >= 4. +func hash4(b []byte) uint32 { + return hash4u(binary.LittleEndian.Uint32(b), hashBits) +} + +// bulkHash4 will compute hashes using the same +// algorithm as hash4 +func bulkHash4(b []byte, dst []uint32) { + if len(b) < 4 { + return + } + hb := binary.LittleEndian.Uint32(b) + + dst[0] = hash4u(hb, hashBits) + end := len(b) - 4 + 1 + for i := 1; i < end; i++ { + hb = (hb >> 8) | uint32(b[i+3])<<24 + dst[i] = hash4u(hb, hashBits) + } +} + +func (d *compressor) initDeflate() { + d.window = make([]byte, 2*windowSize) + d.byteAvailable = false + d.err = nil + if d.state == nil { + return + } + s := d.state + s.index = 0 + s.hashOffset = 1 + s.length = minMatchLength - 1 + s.offset = 0 + s.chainHead = -1 +} + +// deflateLazy is the same as deflate, but with d.fastSkipHashing == skipNever, +// meaning it always has lazy matching on. +func (d *compressor) deflateLazy() { + s := d.state + // Sanity enables additional runtime tests. 
+ // It's intended to be used during development + // to supplement the currently ad-hoc unit tests. + const sanity = debugDeflate + + if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync { + return + } + if d.windowEnd != s.index && d.chain > 100 { + // Get literal huffman coder. + if d.h == nil { + d.h = newHuffmanEncoder(maxFlateBlockTokens) + } + var tmp [256]uint16 + for _, v := range d.window[s.index:d.windowEnd] { + tmp[v]++ + } + d.h.generate(tmp[:], 15) + } + + s.maxInsertIndex = d.windowEnd - (minMatchLength - 1) + + for { + if sanity && s.index > d.windowEnd { + panic("index > windowEnd") + } + lookahead := d.windowEnd - s.index + if lookahead < minMatchLength+maxMatchLength { + if !d.sync { + return + } + if sanity && s.index > d.windowEnd { + panic("index > windowEnd") + } + if lookahead == 0 { + // Flush current output block if any. + if d.byteAvailable { + // There is still one pending token that needs to be flushed + d.tokens.AddLiteral(d.window[s.index-1]) + d.byteAvailable = false + } + if d.tokens.n > 0 { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + return + } + } + if s.index < s.maxInsertIndex { + // Update the hash + hash := hash4(d.window[s.index:]) + ch := s.hashHead[hash] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[hash] = uint32(s.index + s.hashOffset) + } + prevLength := s.length + prevOffset := s.offset + s.length = minMatchLength - 1 + s.offset = 0 + minIndex := s.index - windowSize + if minIndex < 0 { + minIndex = 0 + } + + if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy { + if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, lookahead); ok { + s.length = newLength + s.offset = newOffset + } + } + + if prevLength >= minMatchLength && s.length <= prevLength { + // Check for better match at end... + // + // checkOff must be >=2 since we otherwise risk checking s.index + // Offset of 2 seems to yield best results. + const checkOff = 2 + prevIndex := s.index - 1 + if prevIndex+prevLength+checkOff < s.maxInsertIndex { + end := lookahead + if lookahead > maxMatchLength { + end = maxMatchLength + } + end += prevIndex + idx := prevIndex + prevLength - (4 - checkOff) + h := hash4(d.window[idx:]) + ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength + (4 - checkOff) + if ch2 > minIndex { + length := matchLen(d.window[prevIndex:end], d.window[ch2:]) + // It seems like a pure length metric is best. + if length > prevLength { + prevLength = length + prevOffset = prevIndex - ch2 + } + } + } + // There was a match at the previous step, and the current match is + // not better. Output the previous match. + d.tokens.AddMatch(uint32(prevLength-3), uint32(prevOffset-minOffsetSize)) + + // Insert in the hash table all strings up to the end of the match. + // index and index-1 are already inserted. If there is not enough + // lookahead, the last two strings are not inserted into the hash + // table. 
+ newIndex := s.index + prevLength - 1 + // Calculate missing hashes + end := newIndex + if end > s.maxInsertIndex { + end = s.maxInsertIndex + } + end += minMatchLength - 1 + startindex := s.index + 1 + if startindex > s.maxInsertIndex { + startindex = s.maxInsertIndex + } + tocheck := d.window[startindex:end] + dstSize := len(tocheck) - minMatchLength + 1 + if dstSize > 0 { + dst := s.hashMatch[:dstSize] + bulkHash4(tocheck, dst) + var newH uint32 + for i, val := range dst { + di := i + startindex + newH = val & hashMask + // Get previous value with the same hash. + // Our chain should point to the previous value. + s.hashPrev[di&windowMask] = s.hashHead[newH] + // Set the head of the hash chain to us. + s.hashHead[newH] = uint32(di + s.hashOffset) + } + } + + s.index = newIndex + d.byteAvailable = false + s.length = minMatchLength - 1 + if d.tokens.n == maxFlateBlockTokens { + // The block includes the current character + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + s.ii = 0 + } else { + // Reset, if we got a match this run. + if s.length >= minMatchLength { + s.ii = 0 + } + // We have a byte waiting. Emit it. + if d.byteAvailable { + s.ii++ + d.tokens.AddLiteral(d.window[s.index-1]) + if d.tokens.n == maxFlateBlockTokens { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + s.index++ + + // If we have a long run of no matches, skip additional bytes + // Resets when s.ii overflows after 64KB. + if n := int(s.ii) - d.chain; n > 0 { + n = 1 + int(n>>6) + for j := 0; j < n; j++ { + if s.index >= d.windowEnd-1 { + break + } + d.tokens.AddLiteral(d.window[s.index-1]) + if d.tokens.n == maxFlateBlockTokens { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + // Index... + if s.index < s.maxInsertIndex { + h := hash4(d.window[s.index:]) + ch := s.hashHead[h] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[h] = uint32(s.index + s.hashOffset) + } + s.index++ + } + // Flush last byte + d.tokens.AddLiteral(d.window[s.index-1]) + d.byteAvailable = false + // s.length = minMatchLength - 1 // not needed, since s.ii is reset above, so it should never be > minMatchLength + if d.tokens.n == maxFlateBlockTokens { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + } + } else { + s.index++ + d.byteAvailable = true + } + } + } +} + +func (d *compressor) store() { + if d.windowEnd > 0 && (d.windowEnd == maxStoreBlockSize || d.sync) { + d.err = d.writeStoredBlock(d.window[:d.windowEnd]) + d.windowEnd = 0 + } +} + +// fillWindow will fill the buffer with data for huffman-only compression. +// The number of bytes copied is returned. +func (d *compressor) fillBlock(b []byte) int { + n := copy(d.window[d.windowEnd:], b) + d.windowEnd += n + return n +} + +// storeHuff will compress and store the currently added data, +// if enough has been accumulated or we at the end of the stream. +// Any error that occurred will be in d.err +func (d *compressor) storeHuff() { + if d.windowEnd < len(d.window) && !d.sync || d.windowEnd == 0 { + return + } + d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync) + d.err = d.w.err + d.windowEnd = 0 +} + +// storeFast will compress and store the currently added data, +// if enough has been accumulated or we at the end of the stream. 
+// Any error that occurred will be in d.err +func (d *compressor) storeFast() { + // We only compress if we have maxStoreBlockSize. + if d.windowEnd < len(d.window) { + if !d.sync { + return + } + // Handle extremely small sizes. + if d.windowEnd < 128 { + if d.windowEnd == 0 { + return + } + if d.windowEnd <= 32 { + d.err = d.writeStoredBlock(d.window[:d.windowEnd]) + } else { + d.w.writeBlockHuff(false, d.window[:d.windowEnd], true) + d.err = d.w.err + } + d.tokens.Reset() + d.windowEnd = 0 + d.fast.Reset() + return + } + } + + d.fast.Encode(&d.tokens, d.window[:d.windowEnd]) + // If we made zero matches, store the block as is. + if d.tokens.n == 0 { + d.err = d.writeStoredBlock(d.window[:d.windowEnd]) + // If we removed less than 1/16th, huffman compress the block. + } else if int(d.tokens.n) > d.windowEnd-(d.windowEnd>>4) { + d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync) + d.err = d.w.err + } else { + d.w.writeBlockDynamic(&d.tokens, false, d.window[:d.windowEnd], d.sync) + d.err = d.w.err + } + d.tokens.Reset() + d.windowEnd = 0 +} + +// write will add input byte to the stream. +// Unless an error occurs all bytes will be consumed. +func (d *compressor) write(b []byte) (n int, err error) { + if d.err != nil { + return 0, d.err + } + n = len(b) + for len(b) > 0 { + if d.windowEnd == len(d.window) || d.sync { + d.step(d) + } + b = b[d.fill(d, b):] + if d.err != nil { + return 0, d.err + } + } + return n, d.err +} + +func (d *compressor) syncFlush() error { + d.sync = true + if d.err != nil { + return d.err + } + d.step(d) + if d.err == nil { + d.w.writeStoredHeader(0, false) + d.w.flush() + d.err = d.w.err + } + d.sync = false + return d.err +} + +func (d *compressor) init(w io.Writer, level int) (err error) { + d.w = newHuffmanBitWriter(w) + + switch { + case level == NoCompression: + d.window = make([]byte, maxStoreBlockSize) + d.fill = (*compressor).fillBlock + d.step = (*compressor).store + case level == ConstantCompression: + d.w.logNewTablePenalty = 10 + d.window = make([]byte, 32<<10) + d.fill = (*compressor).fillBlock + d.step = (*compressor).storeHuff + case level == DefaultCompression: + level = 5 + fallthrough + case level >= 1 && level <= 6: + d.w.logNewTablePenalty = 7 + d.fast = newFastEnc(level) + d.window = make([]byte, maxStoreBlockSize) + d.fill = (*compressor).fillBlock + d.step = (*compressor).storeFast + case 7 <= level && level <= 9: + d.w.logNewTablePenalty = 8 + d.state = &advancedState{} + d.compressionLevel = levels[level] + d.initDeflate() + d.fill = (*compressor).fillDeflate + d.step = (*compressor).deflateLazy + default: + return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level) + } + d.level = level + return nil +} + +// reset the state of the compressor. +func (d *compressor) reset(w io.Writer) { + d.w.reset(w) + d.sync = false + d.err = nil + // We only need to reset a few things for Snappy. + if d.fast != nil { + d.fast.Reset() + d.windowEnd = 0 + d.tokens.Reset() + return + } + switch d.compressionLevel.chain { + case 0: + // level was NoCompression or ConstantCompresssion. 
+ d.windowEnd = 0 + default: + s := d.state + s.chainHead = -1 + for i := range s.hashHead { + s.hashHead[i] = 0 + } + for i := range s.hashPrev { + s.hashPrev[i] = 0 + } + s.hashOffset = 1 + s.index, d.windowEnd = 0, 0 + d.blockStart, d.byteAvailable = 0, false + d.tokens.Reset() + s.length = minMatchLength - 1 + s.offset = 0 + s.ii = 0 + s.maxInsertIndex = 0 + } +} + +func (d *compressor) close() error { + if d.err != nil { + return d.err + } + d.sync = true + d.step(d) + if d.err != nil { + return d.err + } + if d.w.writeStoredHeader(0, true); d.w.err != nil { + return d.w.err + } + d.w.flush() + d.w.reset(nil) + return d.w.err +} + +// NewWriter returns a new Writer compressing data at the given level. +// Following zlib, levels range from 1 (BestSpeed) to 9 (BestCompression); +// higher levels typically run slower but compress more. +// Level 0 (NoCompression) does not attempt any compression; it only adds the +// necessary DEFLATE framing. +// Level -1 (DefaultCompression) uses the default compression level. +// Level -2 (ConstantCompression) will use Huffman compression only, giving +// a very fast compression for all types of input, but sacrificing considerable +// compression efficiency. +// +// If level is in the range [-2, 9] then the error returned will be nil. +// Otherwise the error returned will be non-nil. +func NewWriter(w io.Writer, level int) (*Writer, error) { + var dw Writer + if err := dw.d.init(w, level); err != nil { + return nil, err + } + return &dw, nil +} + +// NewWriterDict is like NewWriter but initializes the new +// Writer with a preset dictionary. The returned Writer behaves +// as if the dictionary had been written to it without producing +// any compressed output. The compressed data written to w +// can only be decompressed by a Reader initialized with the +// same dictionary. +func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) { + zw, err := NewWriter(w, level) + if err != nil { + return nil, err + } + zw.d.fillWindow(dict) + zw.dict = append(zw.dict, dict...) // duplicate dictionary for Reset method. + return zw, err +} + +// A Writer takes data written to it and writes the compressed +// form of that data to an underlying writer (see NewWriter). +type Writer struct { + d compressor + dict []byte +} + +// Write writes data to w, which will eventually write the +// compressed form of data to its underlying writer. +func (w *Writer) Write(data []byte) (n int, err error) { + return w.d.write(data) +} + +// Flush flushes any pending data to the underlying writer. +// It is useful mainly in compressed network protocols, to ensure that +// a remote reader has enough data to reconstruct a packet. +// Flush does not return until the data has been written. +// Calling Flush when there is no pending data still causes the Writer +// to emit a sync marker of at least 4 bytes. +// If the underlying writer returns an error, Flush returns that error. +// +// In the terminology of the zlib library, Flush is equivalent to Z_SYNC_FLUSH. +func (w *Writer) Flush() error { + // For more about flushing: + // http://www.bolet.org/~pornin/deflate-flush.html + return w.d.syncFlush() +} + +// Close flushes and closes the writer. +func (w *Writer) Close() error { + return w.d.close() +} + +// Reset discards the writer's state and makes it equivalent to +// the result of NewWriter or NewWriterDict called with dst +// and w's level and dictionary. 
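Editorial aside (illustrative, not part of the vendored file): the exported surface added above — NewWriter, Write, Flush, Close — already covers a full compression round. A minimal usage sketch, assuming the vendored import path github.com/klauspost/compress/flate and an arbitrary level of 5:

package main

import (
	"bytes"
	"fmt"
	"log"

	"github.com/klauspost/compress/flate"
)

func main() {
	var buf bytes.Buffer

	// Level 5 falls in the "fast" range handled by storeFast above;
	// DefaultCompression (-1) is mapped to the same level in init.
	zw, err := flate.NewWriter(&buf, 5)
	if err != nil {
		log.Fatal(err)
	}
	if _, err := zw.Write([]byte("hello, deflate")); err != nil {
		log.Fatal(err)
	}
	// Flush corresponds to syncFlush above: it emits an empty stored
	// block as a sync marker so a reader can decode everything so far.
	if err := zw.Flush(); err != nil {
		log.Fatal(err)
	}
	if err := zw.Close(); err != nil {
		log.Fatal(err)
	}
	fmt.Println("compressed bytes:", buf.Len())
}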
+func (w *Writer) Reset(dst io.Writer) { + if len(w.dict) > 0 { + // w was created with NewWriterDict + w.d.reset(dst) + if dst != nil { + w.d.fillWindow(w.dict) + } + } else { + // w was created with NewWriter + w.d.reset(dst) + } +} + +// ResetDict discards the writer's state and makes it equivalent to +// the result of NewWriter or NewWriterDict called with dst +// and w's level, but sets a specific dictionary. +func (w *Writer) ResetDict(dst io.Writer, dict []byte) { + w.dict = dict + w.d.reset(dst) + w.d.fillWindow(w.dict) +} diff --git a/vendor/github.com/klauspost/compress/flate/dict_decoder.go b/vendor/github.com/klauspost/compress/flate/dict_decoder.go new file mode 100644 index 000000000..71c75a065 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/dict_decoder.go @@ -0,0 +1,184 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// dictDecoder implements the LZ77 sliding dictionary as used in decompression. +// LZ77 decompresses data through sequences of two forms of commands: +// +// * Literal insertions: Runs of one or more symbols are inserted into the data +// stream as is. This is accomplished through the writeByte method for a +// single symbol, or combinations of writeSlice/writeMark for multiple symbols. +// Any valid stream must start with a literal insertion if no preset dictionary +// is used. +// +// * Backward copies: Runs of one or more symbols are copied from previously +// emitted data. Backward copies come as the tuple (dist, length) where dist +// determines how far back in the stream to copy from and length determines how +// many bytes to copy. Note that it is valid for the length to be greater than +// the distance. Since LZ77 uses forward copies, that situation is used to +// perform a form of run-length encoding on repeated runs of symbols. +// The writeCopy and tryWriteCopy are used to implement this command. +// +// For performance reasons, this implementation performs little to no sanity +// checks about the arguments. As such, the invariants documented for each +// method call must be respected. +type dictDecoder struct { + hist []byte // Sliding window history + + // Invariant: 0 <= rdPos <= wrPos <= len(hist) + wrPos int // Current output position in buffer + rdPos int // Have emitted hist[:rdPos] already + full bool // Has a full window length been written yet? +} + +// init initializes dictDecoder to have a sliding window dictionary of the given +// size. If a preset dict is provided, it will initialize the dictionary with +// the contents of dict. +func (dd *dictDecoder) init(size int, dict []byte) { + *dd = dictDecoder{hist: dd.hist} + + if cap(dd.hist) < size { + dd.hist = make([]byte, size) + } + dd.hist = dd.hist[:size] + + if len(dict) > len(dd.hist) { + dict = dict[len(dict)-len(dd.hist):] + } + dd.wrPos = copy(dd.hist, dict) + if dd.wrPos == len(dd.hist) { + dd.wrPos = 0 + dd.full = true + } + dd.rdPos = dd.wrPos +} + +// histSize reports the total amount of historical data in the dictionary. +func (dd *dictDecoder) histSize() int { + if dd.full { + return len(dd.hist) + } + return dd.wrPos +} + +// availRead reports the number of bytes that can be flushed by readFlush. +func (dd *dictDecoder) availRead() int { + return dd.wrPos - dd.rdPos +} + +// availWrite reports the available amount of output buffer space. 
+func (dd *dictDecoder) availWrite() int { + return len(dd.hist) - dd.wrPos +} + +// writeSlice returns a slice of the available buffer to write data to. +// +// This invariant will be kept: len(s) <= availWrite() +func (dd *dictDecoder) writeSlice() []byte { + return dd.hist[dd.wrPos:] +} + +// writeMark advances the writer pointer by cnt. +// +// This invariant must be kept: 0 <= cnt <= availWrite() +func (dd *dictDecoder) writeMark(cnt int) { + dd.wrPos += cnt +} + +// writeByte writes a single byte to the dictionary. +// +// This invariant must be kept: 0 < availWrite() +func (dd *dictDecoder) writeByte(c byte) { + dd.hist[dd.wrPos] = c + dd.wrPos++ +} + +// writeCopy copies a string at a given (dist, length) to the output. +// This returns the number of bytes copied and may be less than the requested +// length if the available space in the output buffer is too small. +// +// This invariant must be kept: 0 < dist <= histSize() +func (dd *dictDecoder) writeCopy(dist, length int) int { + dstBase := dd.wrPos + dstPos := dstBase + srcPos := dstPos - dist + endPos := dstPos + length + if endPos > len(dd.hist) { + endPos = len(dd.hist) + } + + // Copy non-overlapping section after destination position. + // + // This section is non-overlapping in that the copy length for this section + // is always less than or equal to the backwards distance. This can occur + // if a distance refers to data that wraps-around in the buffer. + // Thus, a backwards copy is performed here; that is, the exact bytes in + // the source prior to the copy is placed in the destination. + if srcPos < 0 { + srcPos += len(dd.hist) + dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:]) + srcPos = 0 + } + + // Copy possibly overlapping section before destination position. + // + // This section can overlap if the copy length for this section is larger + // than the backwards distance. This is allowed by LZ77 so that repeated + // strings can be succinctly represented using (dist, length) pairs. + // Thus, a forwards copy is performed here; that is, the bytes copied is + // possibly dependent on the resulting bytes in the destination as the copy + // progresses along. This is functionally equivalent to the following: + // + // for i := 0; i < endPos-dstPos; i++ { + // dd.hist[dstPos+i] = dd.hist[srcPos+i] + // } + // dstPos = endPos + // + for dstPos < endPos { + dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos]) + } + + dd.wrPos = dstPos + return dstPos - dstBase +} + +// tryWriteCopy tries to copy a string at a given (distance, length) to the +// output. This specialized version is optimized for short distances. +// +// This method is designed to be inlined for performance reasons. +// +// This invariant must be kept: 0 < dist <= histSize() +func (dd *dictDecoder) tryWriteCopy(dist, length int) int { + dstPos := dd.wrPos + endPos := dstPos + length + if dstPos < dist || endPos > len(dd.hist) { + return 0 + } + dstBase := dstPos + srcPos := dstPos - dist + + // Copy possibly overlapping section before destination position. +loop: + dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos]) + if dstPos < endPos { + goto loop // Avoid for-loop so that this function can be inlined + } + + dd.wrPos = dstPos + return dstPos - dstBase +} + +// readFlush returns a slice of the historical buffer that is ready to be +// emitted to the user. The data returned by readFlush must be fully consumed +// before calling any other dictDecoder methods. 
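Editorial aside (illustrative, not part of the vendored file): writeCopy above depends on the LZ77 rule that a match length may exceed its backward distance, in which case the forward copy re-reads bytes it has just written and effectively performs run-length expansion. A toy sketch of that behaviour, independent of the dictDecoder buffer management:

package main

import "fmt"

// lz77Copy appends length bytes to dst, each taken dist bytes back from the
// current end. Because the copy moves forward one byte at a time, a distance
// smaller than the length repeats the bytes it has just produced, exactly as
// DEFLATE allows.
func lz77Copy(dst []byte, dist, length int) []byte {
	for i := 0; i < length; i++ {
		dst = append(dst, dst[len(dst)-dist])
	}
	return dst
}

func main() {
	out := lz77Copy([]byte("ab"), 2, 6)
	fmt.Println(string(out)) // "abababab": (dist=2, length=6) repeats "ab"
}

The hot path in writeCopy does the same work with chunked copy calls instead of a byte-at-a-time loop.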
+func (dd *dictDecoder) readFlush() []byte { + toRead := dd.hist[dd.rdPos:dd.wrPos] + dd.rdPos = dd.wrPos + if dd.wrPos == len(dd.hist) { + dd.wrPos, dd.rdPos = 0, 0 + dd.full = true + } + return toRead +} diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go new file mode 100644 index 000000000..f781aaa62 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go @@ -0,0 +1,233 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Modified for deflate by Klaus Post (c) 2015. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +import ( + "encoding/binary" + "fmt" + "math/bits" +) + +type fastEnc interface { + Encode(dst *tokens, src []byte) + Reset() +} + +func newFastEnc(level int) fastEnc { + switch level { + case 1: + return &fastEncL1{fastGen: fastGen{cur: maxStoreBlockSize}} + case 2: + return &fastEncL2{fastGen: fastGen{cur: maxStoreBlockSize}} + case 3: + return &fastEncL3{fastGen: fastGen{cur: maxStoreBlockSize}} + case 4: + return &fastEncL4{fastGen: fastGen{cur: maxStoreBlockSize}} + case 5: + return &fastEncL5{fastGen: fastGen{cur: maxStoreBlockSize}} + case 6: + return &fastEncL6{fastGen: fastGen{cur: maxStoreBlockSize}} + default: + panic("invalid level specified") + } +} + +const ( + tableBits = 15 // Bits used in the table + tableSize = 1 << tableBits // Size of the table + tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32. + baseMatchOffset = 1 // The smallest match offset + baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5 + maxMatchOffset = 1 << 15 // The largest match offset + + bTableBits = 17 // Bits used in the big tables + bTableSize = 1 << bTableBits // Size of the table + allocHistory = maxStoreBlockSize * 5 // Size to preallocate for history. + bufferReset = (1 << 31) - allocHistory - maxStoreBlockSize - 1 // Reset the buffer offset when reaching this. +) + +const ( + prime3bytes = 506832829 + prime4bytes = 2654435761 + prime5bytes = 889523592379 + prime6bytes = 227718039650203 + prime7bytes = 58295818150454627 + prime8bytes = 0xcf1bbcdcb7a56463 +) + +func load32(b []byte, i int) uint32 { + // Help the compiler eliminate bounds checks on the read so it can be done in a single read. + b = b[i:] + b = b[:4] + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +func load64(b []byte, i int) uint64 { + return binary.LittleEndian.Uint64(b[i:]) +} + +func load3232(b []byte, i int32) uint32 { + return binary.LittleEndian.Uint32(b[i:]) +} + +func load6432(b []byte, i int32) uint64 { + return binary.LittleEndian.Uint64(b[i:]) +} + +func hash(u uint32) uint32 { + return (u * 0x1e35a7bd) >> tableShift +} + +type tableEntry struct { + offset int32 +} + +// fastGen maintains the table for matches, +// and the previous byte block for level 2. +// This is the generic implementation. 
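Editorial aside (illustrative, not part of the vendored file): the fast encoder finds match candidates by hashing a few input bytes into a small table, as in hash above, which multiplies by a large odd constant and keeps the top tableBits bits. A self-contained sketch of that scheme (hashBucket is a helper local to this example, not the package's hash4):

package main

import (
	"encoding/binary"
	"fmt"
)

const (
	tableBits  = 15             // as defined above
	tableShift = 32 - tableBits // keep the most significant tableBits bits
)

// hashBucket maps the first four bytes of b into a table with 1<<tableBits
// entries: multiply by a large odd constant, then keep the high bits — the
// same shape as hash(u) in fast_encoder.go.
func hashBucket(b []byte) uint32 {
	u := binary.LittleEndian.Uint32(b)
	return (u * 0x1e35a7bd) >> tableShift
}

func main() {
	fmt.Println(hashBucket([]byte("abcd"))) // a bucket index in [0, 32768)
	fmt.Println(hashBucket([]byte("abce"))) // different 4-byte prefix, usually a different bucket
}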
+type fastGen struct { + hist []byte + cur int32 +} + +func (e *fastGen) addBlock(src []byte) int32 { + // check if we have space already + if len(e.hist)+len(src) > cap(e.hist) { + if cap(e.hist) == 0 { + e.hist = make([]byte, 0, allocHistory) + } else { + if cap(e.hist) < maxMatchOffset*2 { + panic("unexpected buffer size") + } + // Move down + offset := int32(len(e.hist)) - maxMatchOffset + copy(e.hist[0:maxMatchOffset], e.hist[offset:]) + e.cur += offset + e.hist = e.hist[:maxMatchOffset] + } + } + s := int32(len(e.hist)) + e.hist = append(e.hist, src...) + return s +} + +// hash4 returns the hash of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <32. +func hash4u(u uint32, h uint8) uint32 { + return (u * prime4bytes) >> (32 - h) +} + +type tableEntryPrev struct { + Cur tableEntry + Prev tableEntry +} + +// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <32. +func hash4x64(u uint64, h uint8) uint32 { + return (uint32(u) * prime4bytes) >> ((32 - h) & reg8SizeMask32) +} + +// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash7(u uint64, h uint8) uint32 { + return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & reg8SizeMask64)) +} + +// hash8 returns the hash of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash8(u uint64, h uint8) uint32 { + return uint32((u * prime8bytes) >> ((64 - h) & reg8SizeMask64)) +} + +// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash6(u uint64, h uint8) uint32 { + return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & reg8SizeMask64)) +} + +// matchlen will return the match length between offsets and t in src. +// The maximum length returned is maxMatchLength - 4. +// It is assumed that s > t, that t >=0 and s < len(src). +func (e *fastGen) matchlen(s, t int32, src []byte) int32 { + if debugDecode { + if t >= s { + panic(fmt.Sprint("t >=s:", t, s)) + } + if int(s) >= len(src) { + panic(fmt.Sprint("s >= len(src):", s, len(src))) + } + if t < 0 { + panic(fmt.Sprint("t < 0:", t)) + } + if s-t > maxMatchOffset { + panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")")) + } + } + s1 := int(s) + maxMatchLength - 4 + if s1 > len(src) { + s1 = len(src) + } + + // Extend the match to be as long as possible. + return int32(matchLen(src[s:s1], src[t:])) +} + +// matchlenLong will return the match length between offsets and t in src. +// It is assumed that s > t, that t >=0 and s < len(src). +func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 { + if debugDeflate { + if t >= s { + panic(fmt.Sprint("t >=s:", t, s)) + } + if int(s) >= len(src) { + panic(fmt.Sprint("s >= len(src):", s, len(src))) + } + if t < 0 { + panic(fmt.Sprint("t < 0:", t)) + } + if s-t > maxMatchOffset { + panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")")) + } + } + // Extend the match to be as long as possible. + return int32(matchLen(src[s:], src[t:])) +} + +// Reset the encoding table. +func (e *fastGen) Reset() { + if cap(e.hist) < allocHistory { + e.hist = make([]byte, 0, allocHistory) + } + // We offset current position so everything will be out of reach. 
+ // If we are above the buffer reset it will be cleared anyway since len(hist) == 0. + if e.cur <= bufferReset { + e.cur += maxMatchOffset + int32(len(e.hist)) + } + e.hist = e.hist[:0] +} + +// matchLen returns the maximum length. +// 'a' must be the shortest of the two. +func matchLen(a, b []byte) int { + var checked int + + for len(a) >= 8 { + if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 { + return checked + (bits.TrailingZeros64(diff) >> 3) + } + checked += 8 + a = a[8:] + b = b[8:] + } + b = b[:len(a)] + for i := range a { + if a[i] != b[i] { + return i + checked + } + } + return len(a) + checked +} diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go new file mode 100644 index 000000000..40ef45c2f --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go @@ -0,0 +1,1185 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +import ( + "encoding/binary" + "fmt" + "io" + "math" +) + +const ( + // The largest offset code. + offsetCodeCount = 30 + + // The special code used to mark the end of a block. + endBlockMarker = 256 + + // The first length code. + lengthCodesStart = 257 + + // The number of codegen codes. + codegenCodeCount = 19 + badCode = 255 + + // maxPredefinedTokens is the maximum number of tokens + // where we check if fixed size is smaller. + maxPredefinedTokens = 250 + + // bufferFlushSize indicates the buffer size + // after which bytes are flushed to the writer. + // Should preferably be a multiple of 6, since + // we accumulate 6 bytes between writes to the buffer. + bufferFlushSize = 246 + + // bufferSize is the actual output byte buffer size. + // It must have additional headroom for a flush + // which can contain up to 8 bytes. + bufferSize = bufferFlushSize + 8 +) + +// Minimum length code that emits bits. +const lengthExtraBitsMinCode = 8 + +// The number of extra bits needed by length code X - LENGTH_CODES_START. +var lengthExtraBits = [32]uint8{ + /* 257 */ 0, 0, 0, + /* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, + /* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, + /* 280 */ 4, 5, 5, 5, 5, 0, +} + +// The length indicated by length code X - LENGTH_CODES_START. +var lengthBase = [32]uint8{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, + 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, + 64, 80, 96, 112, 128, 160, 192, 224, 255, +} + +// Minimum offset code that emits bits. +const offsetExtraBitsMinCode = 4 + +// offset code word extra bits. +var offsetExtraBits = [32]int8{ + 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, + 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, + 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, + /* extended window */ + 14, 14, +} + +var offsetCombined = [32]uint32{} + +func init() { + var offsetBase = [32]uint32{ + /* normal deflate */ + 0x000000, 0x000001, 0x000002, 0x000003, 0x000004, + 0x000006, 0x000008, 0x00000c, 0x000010, 0x000018, + 0x000020, 0x000030, 0x000040, 0x000060, 0x000080, + 0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300, + 0x000400, 0x000600, 0x000800, 0x000c00, 0x001000, + 0x001800, 0x002000, 0x003000, 0x004000, 0x006000, + + /* extended window */ + 0x008000, 0x00c000, + } + + for i := range offsetCombined[:] { + // Don't use extended window values... 
+ if offsetExtraBits[i] == 0 || offsetBase[i] > 0x006000 { + continue + } + offsetCombined[i] = uint32(offsetExtraBits[i]) | (offsetBase[i] << 8) + } +} + +// The odd order in which the codegen code sizes are written. +var codegenOrder = []uint32{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15} + +type huffmanBitWriter struct { + // writer is the underlying writer. + // Do not use it directly; use the write method, which ensures + // that Write errors are sticky. + writer io.Writer + + // Data waiting to be written is bytes[0:nbytes] + // and then the low nbits of bits. + bits uint64 + nbits uint8 + nbytes uint8 + lastHuffMan bool + literalEncoding *huffmanEncoder + tmpLitEncoding *huffmanEncoder + offsetEncoding *huffmanEncoder + codegenEncoding *huffmanEncoder + err error + lastHeader int + // Set between 0 (reused block can be up to 2x the size) + logNewTablePenalty uint + bytes [256 + 8]byte + literalFreq [lengthCodesStart + 32]uint16 + offsetFreq [32]uint16 + codegenFreq [codegenCodeCount]uint16 + + // codegen must have an extra space for the final symbol. + codegen [literalCount + offsetCodeCount + 1]uint8 +} + +// Huffman reuse. +// +// The huffmanBitWriter supports reusing huffman tables and thereby combining block sections. +// +// This is controlled by several variables: +// +// If lastHeader is non-zero the Huffman table can be reused. +// This also indicates that a Huffman table has been generated that can output all +// possible symbols. +// It also indicates that an EOB has not yet been emitted, so if a new tabel is generated +// an EOB with the previous table must be written. +// +// If lastHuffMan is set, a table for outputting literals has been generated and offsets are invalid. +// +// An incoming block estimates the output size of a new table using a 'fresh' by calculating the +// optimal size and adding a penalty in 'logNewTablePenalty'. +// A Huffman table is not optimal, which is why we add a penalty, and generating a new table +// is slower both for compression and decompression. 
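Editorial aside (illustrative, not part of the vendored file): the reuse strategy described above comes down to a size comparison in which a freshly generated table is charged an extra estimate>>logNewTablePenalty bits, since a real table is rarely as small as the optimal estimate. A reduced sketch of that decision with hypothetical bit counts:

package main

import "fmt"

// shouldStartNewTable reports whether emitting a fresh Huffman table is
// estimated to beat reusing the previous one. estNewBits is the optimal size
// of the block under a new table (including its header), reuseBits is the
// size under the previous table, and logPenalty plays the role of
// logNewTablePenalty above.
func shouldStartNewTable(estNewBits, reuseBits int, logPenalty uint) bool {
	// Penalize the fresh estimate before comparing, because generating and
	// transmitting a new table costs more than the optimal bound suggests.
	estNewBits += estNewBits >> logPenalty
	return estNewBits < reuseBits
}

func main() {
	// With a penalty shift of 7 (~0.8% overhead), a marginally smaller
	// fresh table still loses to reuse.
	fmt.Println(shouldStartNewTable(10000, 10050, 7)) // false: 10078 >= 10050
	fmt.Println(shouldStartNewTable(9000, 10050, 7))  // true: 9070 < 10050
}

writeBlockDynamic below performs this comparison against dynamicReuseSize plus the extra match bits before deciding whether to emit the EOB it still owes.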
+ +func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { + return &huffmanBitWriter{ + writer: w, + literalEncoding: newHuffmanEncoder(literalCount), + tmpLitEncoding: newHuffmanEncoder(literalCount), + codegenEncoding: newHuffmanEncoder(codegenCodeCount), + offsetEncoding: newHuffmanEncoder(offsetCodeCount), + } +} + +func (w *huffmanBitWriter) reset(writer io.Writer) { + w.writer = writer + w.bits, w.nbits, w.nbytes, w.err = 0, 0, 0, nil + w.lastHeader = 0 + w.lastHuffMan = false +} + +func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) { + a := t.offHist[:offsetCodeCount] + b := w.offsetEncoding.codes + b = b[:len(a)] + for i, v := range a { + if v != 0 && b[i].zero() { + return false + } + } + + a = t.extraHist[:literalCount-256] + b = w.literalEncoding.codes[256:literalCount] + b = b[:len(a)] + for i, v := range a { + if v != 0 && b[i].zero() { + return false + } + } + + a = t.litHist[:256] + b = w.literalEncoding.codes[:len(a)] + for i, v := range a { + if v != 0 && b[i].zero() { + return false + } + } + return true +} + +func (w *huffmanBitWriter) flush() { + if w.err != nil { + w.nbits = 0 + return + } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + n := w.nbytes + for w.nbits != 0 { + w.bytes[n] = byte(w.bits) + w.bits >>= 8 + if w.nbits > 8 { // Avoid underflow + w.nbits -= 8 + } else { + w.nbits = 0 + } + n++ + } + w.bits = 0 + w.write(w.bytes[:n]) + w.nbytes = 0 +} + +func (w *huffmanBitWriter) write(b []byte) { + if w.err != nil { + return + } + _, w.err = w.writer.Write(b) +} + +func (w *huffmanBitWriter) writeBits(b int32, nb uint8) { + w.bits |= uint64(b) << (w.nbits & 63) + w.nbits += nb + if w.nbits >= 48 { + w.writeOutBits() + } +} + +func (w *huffmanBitWriter) writeBytes(bytes []byte) { + if w.err != nil { + return + } + n := w.nbytes + if w.nbits&7 != 0 { + w.err = InternalError("writeBytes with unfinished bits") + return + } + for w.nbits != 0 { + w.bytes[n] = byte(w.bits) + w.bits >>= 8 + w.nbits -= 8 + n++ + } + if n != 0 { + w.write(w.bytes[:n]) + } + w.nbytes = 0 + w.write(bytes) +} + +// RFC 1951 3.2.7 specifies a special run-length encoding for specifying +// the literal and offset lengths arrays (which are concatenated into a single +// array). This method generates that run-length encoding. +// +// The result is written into the codegen array, and the frequencies +// of each code is written into the codegenFreq array. +// Codes 0-15 are single byte codes. Codes 16-18 are followed by additional +// information. Code badCode is an end marker +// +// numLiterals The number of literals in literalEncoding +// numOffsets The number of offsets in offsetEncoding +// litenc, offenc The literal and offset encoder to use +func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litEnc, offEnc *huffmanEncoder) { + for i := range w.codegenFreq { + w.codegenFreq[i] = 0 + } + // Note that we are using codegen both as a temporary variable for holding + // a copy of the frequencies, and as the place where we put the result. + // This is fine because the output is always shorter than the input used + // so far. + codegen := w.codegen[:] // cache + // Copy the concatenated code sizes to codegen. Put a marker at the end. 
+ cgnl := codegen[:numLiterals] + for i := range cgnl { + cgnl[i] = litEnc.codes[i].len() + } + + cgnl = codegen[numLiterals : numLiterals+numOffsets] + for i := range cgnl { + cgnl[i] = offEnc.codes[i].len() + } + codegen[numLiterals+numOffsets] = badCode + + size := codegen[0] + count := 1 + outIndex := 0 + for inIndex := 1; size != badCode; inIndex++ { + // INVARIANT: We have seen "count" copies of size that have not yet + // had output generated for them. + nextSize := codegen[inIndex] + if nextSize == size { + count++ + continue + } + // We need to generate codegen indicating "count" of size. + if size != 0 { + codegen[outIndex] = size + outIndex++ + w.codegenFreq[size]++ + count-- + for count >= 3 { + n := 6 + if n > count { + n = count + } + codegen[outIndex] = 16 + outIndex++ + codegen[outIndex] = uint8(n - 3) + outIndex++ + w.codegenFreq[16]++ + count -= n + } + } else { + for count >= 11 { + n := 138 + if n > count { + n = count + } + codegen[outIndex] = 18 + outIndex++ + codegen[outIndex] = uint8(n - 11) + outIndex++ + w.codegenFreq[18]++ + count -= n + } + if count >= 3 { + // count >= 3 && count <= 10 + codegen[outIndex] = 17 + outIndex++ + codegen[outIndex] = uint8(count - 3) + outIndex++ + w.codegenFreq[17]++ + count = 0 + } + } + count-- + for ; count >= 0; count-- { + codegen[outIndex] = size + outIndex++ + w.codegenFreq[size]++ + } + // Set up invariant for next time through the loop. + size = nextSize + count = 1 + } + // Marker indicating the end of the codegen. + codegen[outIndex] = badCode +} + +func (w *huffmanBitWriter) codegens() int { + numCodegens := len(w.codegenFreq) + for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { + numCodegens-- + } + return numCodegens +} + +func (w *huffmanBitWriter) headerSize() (size, numCodegens int) { + numCodegens = len(w.codegenFreq) + for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { + numCodegens-- + } + return 3 + 5 + 5 + 4 + (3 * numCodegens) + + w.codegenEncoding.bitLength(w.codegenFreq[:]) + + int(w.codegenFreq[16])*2 + + int(w.codegenFreq[17])*3 + + int(w.codegenFreq[18])*7, numCodegens +} + +// dynamicSize returns the size of dynamically encoded data in bits. +func (w *huffmanBitWriter) dynamicReuseSize(litEnc, offEnc *huffmanEncoder) (size int) { + size = litEnc.bitLength(w.literalFreq[:]) + + offEnc.bitLength(w.offsetFreq[:]) + return size +} + +// dynamicSize returns the size of dynamically encoded data in bits. +func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) { + header, numCodegens := w.headerSize() + size = header + + litEnc.bitLength(w.literalFreq[:]) + + offEnc.bitLength(w.offsetFreq[:]) + + extraBits + return size, numCodegens +} + +// extraBitSize will return the number of bits that will be written +// as "extra" bits on matches. +func (w *huffmanBitWriter) extraBitSize() int { + total := 0 + for i, n := range w.literalFreq[257:literalCount] { + total += int(n) * int(lengthExtraBits[i&31]) + } + for i, n := range w.offsetFreq[:offsetCodeCount] { + total += int(n) * int(offsetExtraBits[i&31]) + } + return total +} + +// fixedSize returns the size of dynamically encoded data in bits. +func (w *huffmanBitWriter) fixedSize(extraBits int) int { + return 3 + + fixedLiteralEncoding.bitLength(w.literalFreq[:]) + + fixedOffsetEncoding.bitLength(w.offsetFreq[:]) + + extraBits +} + +// storedSize calculates the stored size, including header. 
+// The function returns the size in bits and whether the block +// fits inside a single block. +func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { + if in == nil { + return 0, false + } + if len(in) <= maxStoreBlockSize { + return (len(in) + 5) * 8, true + } + return 0, false +} + +func (w *huffmanBitWriter) writeCode(c hcode) { + // The function does not get inlined if we "& 63" the shift. + w.bits |= c.code64() << (w.nbits & 63) + w.nbits += c.len() + if w.nbits >= 48 { + w.writeOutBits() + } +} + +// writeOutBits will write bits to the buffer. +func (w *huffmanBitWriter) writeOutBits() { + bits := w.bits + w.bits >>= 48 + w.nbits -= 48 + n := w.nbytes + + // We over-write, but faster... + binary.LittleEndian.PutUint64(w.bytes[n:], bits) + n += 6 + + if n >= bufferFlushSize { + if w.err != nil { + n = 0 + return + } + w.write(w.bytes[:n]) + n = 0 + } + + w.nbytes = n +} + +// Write the header of a dynamic Huffman block to the output stream. +// +// numLiterals The number of literals specified in codegen +// numOffsets The number of offsets specified in codegen +// numCodegens The number of codegens used in codegen +func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) { + if w.err != nil { + return + } + var firstBits int32 = 4 + if isEof { + firstBits = 5 + } + w.writeBits(firstBits, 3) + w.writeBits(int32(numLiterals-257), 5) + w.writeBits(int32(numOffsets-1), 5) + w.writeBits(int32(numCodegens-4), 4) + + for i := 0; i < numCodegens; i++ { + value := uint(w.codegenEncoding.codes[codegenOrder[i]].len()) + w.writeBits(int32(value), 3) + } + + i := 0 + for { + var codeWord = uint32(w.codegen[i]) + i++ + if codeWord == badCode { + break + } + w.writeCode(w.codegenEncoding.codes[codeWord]) + + switch codeWord { + case 16: + w.writeBits(int32(w.codegen[i]), 2) + i++ + case 17: + w.writeBits(int32(w.codegen[i]), 3) + i++ + case 18: + w.writeBits(int32(w.codegen[i]), 7) + i++ + } + } +} + +// writeStoredHeader will write a stored header. +// If the stored block is only used for EOF, +// it is replaced with a fixed huffman block. +func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) { + if w.err != nil { + return + } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + + // To write EOF, use a fixed encoding block. 10 bits instead of 5 bytes. + if length == 0 && isEof { + w.writeFixedHeader(isEof) + // EOB: 7 bits, value: 0 + w.writeBits(0, 7) + w.flush() + return + } + + var flag int32 + if isEof { + flag = 1 + } + w.writeBits(flag, 3) + w.flush() + w.writeBits(int32(length), 16) + w.writeBits(int32(^uint16(length)), 16) +} + +func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { + if w.err != nil { + return + } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + + // Indicate that we are a fixed Huffman block + var value int32 = 2 + if isEof { + value = 3 + } + w.writeBits(value, 3) +} + +// writeBlock will write a block of tokens with the smallest encoding. +// The original input can be supplied, and if the huffman encoded data +// is larger than the original bytes, the data will be written as a +// stored block. +// If the input is nil, the tokens will always be Huffman encoded. 
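Editorial aside (illustrative, not part of the vendored file): the "smallest encoding" choice that writeBlock makes reduces to comparing three bit counts — fixed Huffman, dynamic Huffman, and stored — where a stored block costs (len(input)+5)*8 bits per storedSize above. A simplified sketch with hypothetical sizes:

package main

import "fmt"

// chooseBlockType picks the cheapest of the three DEFLATE block types given
// their estimated sizes in bits. storedBits only matters when the raw input
// is available (storable), mirroring the storedSize check above.
func chooseBlockType(fixedBits, dynamicBits, storedBits int, storable bool) string {
	size, kind := fixedBits, "fixed"
	if dynamicBits < size {
		size, kind = dynamicBits, "dynamic"
	}
	// Stored wins ties (<=), matching the comparison in writeBlock.
	if storable && storedBits <= size {
		return "stored"
	}
	return kind
}

func main() {
	// A 1000-byte block stored costs (1000+5)*8 = 8040 bits.
	fmt.Println(chooseBlockType(8600, 7900, 8040, true)) // dynamic
	fmt.Println(chooseBlockType(8600, 8300, 8040, true)) // stored
}

The real method additionally skips the fixed-size estimate once a block holds more than maxPredefinedTokens tokens.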
+func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) { + if w.err != nil { + return + } + + tokens.AddEOB() + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + numLiterals, numOffsets := w.indexTokens(tokens, false) + w.generate() + var extraBits int + storedSize, storable := w.storedSize(input) + if storable { + extraBits = w.extraBitSize() + } + + // Figure out smallest code. + // Fixed Huffman baseline. + var literalEncoding = fixedLiteralEncoding + var offsetEncoding = fixedOffsetEncoding + var size = math.MaxInt32 + if tokens.n < maxPredefinedTokens { + size = w.fixedSize(extraBits) + } + + // Dynamic Huffman? + var numCodegens int + + // Generate codegen and codegenFrequencies, which indicates how to encode + // the literalEncoding and the offsetEncoding. + w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) + w.codegenEncoding.generate(w.codegenFreq[:], 7) + dynamicSize, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits) + + if dynamicSize < size { + size = dynamicSize + literalEncoding = w.literalEncoding + offsetEncoding = w.offsetEncoding + } + + // Stored bytes? + if storable && storedSize <= size { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + + // Huffman. + if literalEncoding == fixedLiteralEncoding { + w.writeFixedHeader(eof) + } else { + w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + } + + // Write the tokens. + w.writeTokens(tokens.Slice(), literalEncoding.codes, offsetEncoding.codes) +} + +// writeBlockDynamic encodes a block using a dynamic Huffman table. +// This should be used if the symbols used have a disproportionate +// histogram distribution. +// If input is supplied and the compression savings are below 1/16th of the +// input size the block is stored. +func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []byte, sync bool) { + if w.err != nil { + return + } + + sync = sync || eof + if sync { + tokens.AddEOB() + } + + // We cannot reuse pure huffman table, and must mark as EOF. + if (w.lastHuffMan || eof) && w.lastHeader > 0 { + // We will not try to reuse. + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + w.lastHuffMan = false + } + + // fillReuse enables filling of empty values. + // This will make encodings always reusable without testing. + // However, this does not appear to benefit on most cases. + const fillReuse = false + + // Check if we can reuse... + if !fillReuse && w.lastHeader > 0 && !w.canReuse(tokens) { + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + + numLiterals, numOffsets := w.indexTokens(tokens, !sync) + extraBits := 0 + ssize, storable := w.storedSize(input) + + const usePrefs = true + if storable || w.lastHeader > 0 { + extraBits = w.extraBitSize() + } + + var size int + + // Check if we should reuse. + if w.lastHeader > 0 { + // Estimate size for using a new table. + // Use the previous header size as the best estimate. + newSize := w.lastHeader + tokens.EstimatedBits() + newSize += int(w.literalEncoding.codes[endBlockMarker].len()) + newSize>>w.logNewTablePenalty + + // The estimated size is calculated as an optimal table. + // We add a penalty to make it more realistic and re-use a bit more. + reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + extraBits + + // Check if a new table is better. 
+ if newSize < reuseSize { + // Write the EOB we owe. + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + size = newSize + w.lastHeader = 0 + } else { + size = reuseSize + } + + if tokens.n < maxPredefinedTokens { + if preSize := w.fixedSize(extraBits) + 7; usePrefs && preSize < size { + // Check if we get a reasonable size decrease. + if storable && ssize <= size { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) + return + } + } + // Check if we get a reasonable size decrease. + if storable && ssize <= size { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + } + + // We want a new block/table + if w.lastHeader == 0 { + if fillReuse && !sync { + w.fillTokens() + numLiterals, numOffsets = maxNumLit, maxNumDist + } else { + w.literalFreq[endBlockMarker] = 1 + } + + w.generate() + // Generate codegen and codegenFrequencies, which indicates how to encode + // the literalEncoding and the offsetEncoding. + w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) + w.codegenEncoding.generate(w.codegenFreq[:], 7) + + var numCodegens int + if fillReuse && !sync { + // Reindex for accurate size... + w.indexTokens(tokens, true) + } + size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits) + + // Store predefined, if we don't get a reasonable improvement. + if tokens.n < maxPredefinedTokens { + if preSize := w.fixedSize(extraBits); usePrefs && preSize <= size { + // Store bytes, if we don't get an improvement. + if storable && ssize <= preSize { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) + return + } + } + + if storable && ssize <= size { + // Store bytes, if we don't get an improvement. + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + + // Write Huffman table. + w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + if !sync { + w.lastHeader, _ = w.headerSize() + } + w.lastHuffMan = false + } + + if sync { + w.lastHeader = 0 + } + // Write the tokens. + w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes) +} + +func (w *huffmanBitWriter) fillTokens() { + for i, v := range w.literalFreq[:literalCount] { + if v == 0 { + w.literalFreq[i] = 1 + } + } + for i, v := range w.offsetFreq[:offsetCodeCount] { + if v == 0 { + w.offsetFreq[i] = 1 + } + } +} + +// indexTokens indexes a slice of tokens, and updates +// literalFreq and offsetFreq, and generates literalEncoding +// and offsetEncoding. +// The number of literal and offset tokens is returned. +func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, numOffsets int) { + copy(w.literalFreq[:], t.litHist[:]) + copy(w.literalFreq[256:], t.extraHist[:]) + copy(w.offsetFreq[:], t.offHist[:offsetCodeCount]) + + if t.n == 0 { + return + } + if filled { + return maxNumLit, maxNumDist + } + // get the number of literals + numLiterals = len(w.literalFreq) + for w.literalFreq[numLiterals-1] == 0 { + numLiterals-- + } + // get the number of offsets + numOffsets = len(w.offsetFreq) + for numOffsets > 0 && w.offsetFreq[numOffsets-1] == 0 { + numOffsets-- + } + if numOffsets == 0 { + // We haven't found a single match. 
If we want to go with the dynamic encoding, + // we should count at least one offset to be sure that the offset huffman tree could be encoded. + w.offsetFreq[0] = 1 + numOffsets = 1 + } + return +} + +func (w *huffmanBitWriter) generate() { + w.literalEncoding.generate(w.literalFreq[:literalCount], 15) + w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15) +} + +// writeTokens writes a slice of tokens to the output. +// codes for literal and offset encoding must be supplied. +func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) { + if w.err != nil { + return + } + if len(tokens) == 0 { + return + } + + // Only last token should be endBlockMarker. + var deferEOB bool + if tokens[len(tokens)-1] == endBlockMarker { + tokens = tokens[:len(tokens)-1] + deferEOB = true + } + + // Create slices up to the next power of two to avoid bounds checks. + lits := leCodes[:256] + offs := oeCodes[:32] + lengths := leCodes[lengthCodesStart:] + lengths = lengths[:32] + + // Go 1.16 LOVES having these on stack. + bits, nbits, nbytes := w.bits, w.nbits, w.nbytes + + for _, t := range tokens { + if t < 256 { + //w.writeCode(lits[t.literal()]) + c := lits[t] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + continue + } + + // Write the length + length := t.length() + lengthCode := lengthCode(length) & 31 + if false { + w.writeCode(lengths[lengthCode]) + } else { + // inlined + c := lengths[lengthCode] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + } + + if lengthCode >= lengthExtraBitsMinCode { + extraLengthBits := lengthExtraBits[lengthCode] + //w.writeBits(extraLength, extraLengthBits) + extraLength := int32(length - lengthBase[lengthCode]) + bits |= uint64(extraLength) << (nbits & 63) + nbits += extraLengthBits + if nbits >= 48 { + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + } + // Write the offset + offset := t.offset() + offsetCode := (offset >> 16) & 31 + if false { + w.writeCode(offs[offsetCode]) + } else { + // inlined + c := offs[offsetCode] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + } + + if offsetCode >= offsetExtraBitsMinCode { + offsetComb := offsetCombined[offsetCode] + //w.writeBits(extraOffset, extraOffsetBits) + bits |= uint64((offset-(offsetComb>>8))&matchOffsetOnlyMask) << (nbits & 63) + nbits += uint8(offsetComb) + if nbits >= 48 { 
+ binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + } + } + // Restore... + w.bits, w.nbits, w.nbytes = bits, nbits, nbytes + + if deferEOB { + w.writeCode(leCodes[endBlockMarker]) + } +} + +// huffOffset is a static offset encoder used for huffman only encoding. +// It can be reused since we will not be encoding offset values. +var huffOffset *huffmanEncoder + +func init() { + w := newHuffmanBitWriter(nil) + w.offsetFreq[0] = 1 + huffOffset = newHuffmanEncoder(offsetCodeCount) + huffOffset.generate(w.offsetFreq[:offsetCodeCount], 15) +} + +// writeBlockHuff encodes a block of bytes as either +// Huffman encoded literals or uncompressed bytes if the +// results only gains very little from compression. +func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { + if w.err != nil { + return + } + + // Clear histogram + for i := range w.literalFreq[:] { + w.literalFreq[i] = 0 + } + if !w.lastHuffMan { + for i := range w.offsetFreq[:] { + w.offsetFreq[i] = 0 + } + } + + const numLiterals = endBlockMarker + 1 + const numOffsets = 1 + + // Add everything as literals + // We have to estimate the header size. + // Assume header is around 70 bytes: + // https://stackoverflow.com/a/25454430 + const guessHeaderSizeBits = 70 * 8 + histogram(input, w.literalFreq[:numLiterals]) + ssize, storable := w.storedSize(input) + if storable && len(input) > 1024 { + // Quick check for incompressible content. + abs := float64(0) + avg := float64(len(input)) / 256 + max := float64(len(input) * 2) + for _, v := range w.literalFreq[:256] { + diff := float64(v) - avg + abs += diff * diff + if abs > max { + break + } + } + if abs < max { + if debugDeflate { + fmt.Println("stored", abs, "<", max) + } + // No chance we can compress this... + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + } + w.literalFreq[endBlockMarker] = 1 + w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15) + estBits := w.tmpLitEncoding.canReuseBits(w.literalFreq[:numLiterals]) + if estBits < math.MaxInt32 { + estBits += w.lastHeader + if w.lastHeader == 0 { + estBits += guessHeaderSizeBits + } + estBits += estBits >> w.logNewTablePenalty + } + + // Store bytes, if we don't get a reasonable improvement. + if storable && ssize <= estBits { + if debugDeflate { + fmt.Println("stored,", ssize, "<=", estBits) + } + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + + if w.lastHeader > 0 { + reuseSize := w.literalEncoding.canReuseBits(w.literalFreq[:256]) + + if estBits < reuseSize { + if debugDeflate { + fmt.Println("NOT reusing, reuse:", reuseSize/8, "> new:", estBits/8, "header est:", w.lastHeader/8, "bytes") + } + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } else if debugDeflate { + fmt.Println("reusing, reuse:", reuseSize/8, "> new:", estBits/8, "- header est:", w.lastHeader/8) + } + } + + count := 0 + if w.lastHeader == 0 { + // Use the temp encoding, so swap. + w.literalEncoding, w.tmpLitEncoding = w.tmpLitEncoding, w.literalEncoding + // Generate codegen and codegenFrequencies, which indicates how to encode + // the literalEncoding and the offsetEncoding. 
+ w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) + w.codegenEncoding.generate(w.codegenFreq[:], 7) + numCodegens := w.codegens() + + // Huffman. + w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + w.lastHuffMan = true + w.lastHeader, _ = w.headerSize() + if debugDeflate { + count += w.lastHeader + fmt.Println("header:", count/8) + } + } + + encoding := w.literalEncoding.codes[:256] + // Go 1.16 LOVES having these on stack. At least 1.5x the speed. + bits, nbits, nbytes := w.bits, w.nbits, w.nbytes + + if debugDeflate { + count -= int(nbytes)*8 + int(nbits) + } + // Unroll, write 3 codes/loop. + // Fastest number of unrolls. + for len(input) > 3 { + // We must have at least 48 bits free. + if nbits >= 8 { + n := nbits >> 3 + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + bits >>= (n * 8) & 63 + nbits -= n * 8 + nbytes += n + } + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + if debugDeflate { + count += int(nbytes) * 8 + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + a, b := encoding[input[0]], encoding[input[1]] + bits |= a.code64() << (nbits & 63) + bits |= b.code64() << ((nbits + a.len()) & 63) + c := encoding[input[2]] + nbits += b.len() + a.len() + bits |= c.code64() << (nbits & 63) + nbits += c.len() + input = input[3:] + } + + // Remaining... + for _, t := range input { + if nbits >= 48 { + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + if debugDeflate { + count += int(nbytes) * 8 + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + // Bitwriting inlined, ~30% speedup + c := encoding[t] + bits |= c.code64() << (nbits & 63) + + nbits += c.len() + if debugDeflate { + count += int(c.len()) + } + } + // Restore... + w.bits, w.nbits, w.nbytes = bits, nbits, nbytes + + if debugDeflate { + nb := count + int(nbytes)*8 + int(nbits) + fmt.Println("wrote", nb, "bits,", nb/8, "bytes.") + } + // Flush if needed to have space. + if w.nbits >= 48 { + w.writeOutBits() + } + + if eof || sync { + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + w.lastHuffMan = false + } +} diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go new file mode 100644 index 000000000..5ac144f28 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go @@ -0,0 +1,412 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +import ( + "math" + "math/bits" +) + +const ( + maxBitsLimit = 16 + // number of valid literals + literalCount = 286 +) + +// hcode is a huffman code with a bit code and bit length. +type hcode uint32 + +func (h hcode) len() uint8 { + return uint8(h) +} + +func (h hcode) code64() uint64 { + return uint64(h >> 8) +} + +func (h hcode) zero() bool { + return h == 0 +} + +type huffmanEncoder struct { + codes []hcode + bitCount [17]int32 + + // Allocate a reusable buffer with the longest possible frequency table. + // Possible lengths are codegenCodeCount, offsetCodeCount and literalCount. + // The largest of these is literalCount, so we allocate for that case. 
+ freqcache [literalCount + 1]literalNode +} + +type literalNode struct { + literal uint16 + freq uint16 +} + +// A levelInfo describes the state of the constructed tree for a given depth. +type levelInfo struct { + // Our level. for better printing + level int32 + + // The frequency of the last node at this level + lastFreq int32 + + // The frequency of the next character to add to this level + nextCharFreq int32 + + // The frequency of the next pair (from level below) to add to this level. + // Only valid if the "needed" value of the next lower level is 0. + nextPairFreq int32 + + // The number of chains remaining to generate for this level before moving + // up to the next level + needed int32 +} + +// set sets the code and length of an hcode. +func (h *hcode) set(code uint16, length uint8) { + *h = hcode(length) | (hcode(code) << 8) +} + +func newhcode(code uint16, length uint8) hcode { + return hcode(length) | (hcode(code) << 8) +} + +func reverseBits(number uint16, bitLength byte) uint16 { + return bits.Reverse16(number << ((16 - bitLength) & 15)) +} + +func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxUint16} } + +func newHuffmanEncoder(size int) *huffmanEncoder { + // Make capacity to next power of two. + c := uint(bits.Len32(uint32(size - 1))) + return &huffmanEncoder{codes: make([]hcode, size, 1<= 3 +// The cases of 0, 1, and 2 literals are handled by special case code. +// +// list An array of the literals with non-zero frequencies +// and their associated frequencies. The array is in order of increasing +// frequency, and has as its last element a special element with frequency +// MaxInt32 +// maxBits The maximum number of bits that should be used to encode any literal. +// Must be less than 16. +// return An integer array in which array[i] indicates the number of literals +// that should be encoded in i bits. +func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { + if maxBits >= maxBitsLimit { + panic("flate: maxBits too large") + } + n := int32(len(list)) + list = list[0 : n+1] + list[n] = maxNode() + + // The tree can't have greater depth than n - 1, no matter what. This + // saves a little bit of work in some small cases + if maxBits > n-1 { + maxBits = n - 1 + } + + // Create information about each of the levels. + // A bogus "Level 0" whose sole purpose is so that + // level1.prev.needed==0. This makes level1.nextPairFreq + // be a legitimate value that never gets chosen. + var levels [maxBitsLimit]levelInfo + // leafCounts[i] counts the number of literals at the left + // of ancestors of the rightmost node at level i. + // leafCounts[i][j] is the number of literals at the left + // of the level j ancestor. + var leafCounts [maxBitsLimit][maxBitsLimit]int32 + + // Descending to only have 1 bounds check. + l2f := int32(list[2].freq) + l1f := int32(list[1].freq) + l0f := int32(list[0].freq) + int32(list[1].freq) + + for level := int32(1); level <= maxBits; level++ { + // For every level, the first two items are the first two characters. + // We initialize the levels as if we had already figured this out. + levels[level] = levelInfo{ + level: level, + lastFreq: l1f, + nextCharFreq: l2f, + nextPairFreq: l0f, + } + leafCounts[level][level] = 2 + if level == 1 { + levels[level].nextPairFreq = math.MaxInt32 + } + } + + // We need a total of 2*n - 2 items at top level and have already generated 2. 
+ levels[maxBits].needed = 2*n - 4 + + level := uint32(maxBits) + for level < 16 { + l := &levels[level] + if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 { + // We've run out of both leafs and pairs. + // End all calculations for this level. + // To make sure we never come back to this level or any lower level, + // set nextPairFreq impossibly large. + l.needed = 0 + levels[level+1].nextPairFreq = math.MaxInt32 + level++ + continue + } + + prevFreq := l.lastFreq + if l.nextCharFreq < l.nextPairFreq { + // The next item on this row is a leaf node. + n := leafCounts[level][level] + 1 + l.lastFreq = l.nextCharFreq + // Lower leafCounts are the same of the previous node. + leafCounts[level][level] = n + e := list[n] + if e.literal < math.MaxUint16 { + l.nextCharFreq = int32(e.freq) + } else { + l.nextCharFreq = math.MaxInt32 + } + } else { + // The next item on this row is a pair from the previous row. + // nextPairFreq isn't valid until we generate two + // more values in the level below + l.lastFreq = l.nextPairFreq + // Take leaf counts from the lower level, except counts[level] remains the same. + if true { + save := leafCounts[level][level] + leafCounts[level] = leafCounts[level-1] + leafCounts[level][level] = save + } else { + copy(leafCounts[level][:level], leafCounts[level-1][:level]) + } + levels[l.level-1].needed = 2 + } + + if l.needed--; l.needed == 0 { + // We've done everything we need to do for this level. + // Continue calculating one level up. Fill in nextPairFreq + // of that level with the sum of the two nodes we've just calculated on + // this level. + if l.level == maxBits { + // All done! + break + } + levels[l.level+1].nextPairFreq = prevFreq + l.lastFreq + level++ + } else { + // If we stole from below, move down temporarily to replenish it. + for levels[level-1].needed > 0 { + level-- + } + } + } + + // Somethings is wrong if at the end, the top level is null or hasn't used + // all of the leaves. + if leafCounts[maxBits][maxBits] != n { + panic("leafCounts[maxBits][maxBits] != n") + } + + bitCount := h.bitCount[:maxBits+1] + bits := 1 + counts := &leafCounts[maxBits] + for level := maxBits; level > 0; level-- { + // chain.leafCount gives the number of literals requiring at least "bits" + // bits to encode. + bitCount[bits] = counts[level] - counts[level-1] + bits++ + } + return bitCount +} + +// Look at the leaves and assign them a bit count and an encoding as specified +// in RFC 1951 3.2.2 +func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalNode) { + code := uint16(0) + for n, bits := range bitCount { + code <<= 1 + if n == 0 || bits == 0 { + continue + } + // The literals list[len(list)-bits] .. list[len(list)-bits] + // are encoded using "bits" bits, and get the values + // code, code + 1, .... The code values are + // assigned in literal order (not frequency order). + chunk := list[len(list)-int(bits):] + + sortByLiteral(chunk) + for _, node := range chunk { + h.codes[node.literal] = newhcode(reverseBits(code, uint8(n)), uint8(n)) + code++ + } + list = list[0 : len(list)-int(bits)] + } +} + +// Update this Huffman Code object to be the minimum code for the specified frequency count. +// +// freq An array of frequencies, in which frequency[i] gives the frequency of literal i. +// maxBits The maximum number of bits to use for any literal. 
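Editorial aside (illustrative, not part of the vendored file): assignEncodingAndSize above applies the canonical code assignment of RFC 1951 section 3.2.2 — codes of a given length are consecutive and handed out in literal order — and additionally bit-reverses each code for LSB-first emission. A standalone sketch of the canonical rule itself, using the worked example from the RFC:

package main

import "fmt"

// canonicalCodes returns the canonical Huffman code for every symbol given
// its bit length (0 means the symbol is unused), following the two-step
// procedure from RFC 1951 section 3.2.2.
func canonicalCodes(lengths []uint8) []uint16 {
	const maxBits = 15
	var blCount [maxBits + 1]uint16
	for _, l := range lengths {
		blCount[l]++
	}
	blCount[0] = 0

	// Smallest code value for each bit length.
	var nextCode [maxBits + 1]uint16
	code := uint16(0)
	for n := 1; n <= maxBits; n++ {
		code = (code + blCount[n-1]) << 1
		nextCode[n] = code
	}

	// Assign consecutive codes to symbols of equal length, in symbol order.
	codes := make([]uint16, len(lengths))
	for sym, l := range lengths {
		if l != 0 {
			codes[sym] = nextCode[l]
			nextCode[l]++
		}
	}
	return codes
}

func main() {
	// RFC 1951 example: symbols A..H with lengths 3,3,3,3,3,2,4,4.
	fmt.Println(canonicalCodes([]uint8{3, 3, 3, 3, 3, 2, 4, 4}))
	// Prints [2 3 4 5 6 0 14 15], i.e. F gets the single 2-bit code.
}

This matches the lengths-to-codes mapping the encoder above computes internally before reverseBits is applied.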
+func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) { + list := h.freqcache[:len(freq)+1] + codes := h.codes[:len(freq)] + // Number of non-zero literals + count := 0 + // Set list to be the set of all non-zero literals and their frequencies + for i, f := range freq { + if f != 0 { + list[count] = literalNode{uint16(i), f} + count++ + } else { + codes[i] = 0 + } + } + list[count] = literalNode{} + + list = list[:count] + if count <= 2 { + // Handle the small cases here, because they are awkward for the general case code. With + // two or fewer literals, everything has bit length 1. + for i, node := range list { + // "list" is in order of increasing literal value. + h.codes[node.literal].set(uint16(i), 1) + } + return + } + sortByFreq(list) + + // Get the number of literals for each bit count + bitCount := h.bitCounts(list, maxBits) + // And do the assignment + h.assignEncodingAndSize(bitCount, list) +} + +// atLeastOne clamps the result between 1 and 15. +func atLeastOne(v float32) float32 { + if v < 1 { + return 1 + } + if v > 15 { + return 15 + } + return v +} + +func histogram(b []byte, h []uint16) { + if true && len(b) >= 8<<10 { + // Split for bigger inputs + histogramSplit(b, h) + } else { + h = h[:256] + for _, t := range b { + h[t]++ + } + } +} + +func histogramSplit(b []byte, h []uint16) { + // Tested, and slightly faster than 2-way. + // Writing to separate arrays and combining is also slightly slower. + h = h[:256] + for len(b)&3 != 0 { + h[b[0]]++ + b = b[1:] + } + n := len(b) / 4 + x, y, z, w := b[:n], b[n:], b[n+n:], b[n+n+n:] + y, z, w = y[:len(x)], z[:len(x)], w[:len(x)] + for i, t := range x { + v0 := &h[t] + v1 := &h[y[i]] + v3 := &h[w[i]] + v2 := &h[z[i]] + *v0++ + *v1++ + *v2++ + *v3++ + } +} diff --git a/vendor/github.com/klauspost/compress/flate/huffman_sortByFreq.go b/vendor/github.com/klauspost/compress/flate/huffman_sortByFreq.go new file mode 100644 index 000000000..207780299 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/huffman_sortByFreq.go @@ -0,0 +1,178 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Sort sorts data. +// It makes one call to data.Len to determine n, and O(n*log(n)) calls to +// data.Less and data.Swap. The sort is not guaranteed to be stable. +func sortByFreq(data []literalNode) { + n := len(data) + quickSortByFreq(data, 0, n, maxDepth(n)) +} + +func quickSortByFreq(data []literalNode, a, b, maxDepth int) { + for b-a > 12 { // Use ShellSort for slices <= 12 elements + if maxDepth == 0 { + heapSort(data, a, b) + return + } + maxDepth-- + mlo, mhi := doPivotByFreq(data, a, b) + // Avoiding recursion on the larger subproblem guarantees + // a stack depth of at most lg(b-a). + if mlo-a < b-mhi { + quickSortByFreq(data, a, mlo, maxDepth) + a = mhi // i.e., quickSortByFreq(data, mhi, b) + } else { + quickSortByFreq(data, mhi, b, maxDepth) + b = mlo // i.e., quickSortByFreq(data, a, mlo) + } + } + if b-a > 1 { + // Do ShellSort pass with gap 6 + // It could be written in this simplified form cause b-a <= 12 + for i := a + 6; i < b; i++ { + if data[i].freq == data[i-6].freq && data[i].literal < data[i-6].literal || data[i].freq < data[i-6].freq { + data[i], data[i-6] = data[i-6], data[i] + } + } + insertionSortByFreq(data, a, b) + } +} + +// siftDownByFreq implements the heap property on data[lo, hi). +// first is an offset into the array where the root of the heap lies. 
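The encoder above (generate, together with bitCounts and assignEncodingAndSize) ultimately applies the canonical-code rule of RFC 1951 section 3.2.2: count how many symbols use each bit length, derive the first code of every length, then hand codes out in symbol order. The following standalone sketch shows that rule in isolation; it is illustrative only, not part of the vendored file, and all names in it are mine.

package main

import "fmt"

func main() {
	// Example code lengths per symbol; shorter lengths go to more frequent symbols.
	symbols := []byte{'a', 'b', 'c', 'd', 'e'}
	lengths := map[byte]int{'a': 1, 'b': 3, 'c': 3, 'd': 3, 'e': 3}

	// Count codes per length, then compute the first code of each length
	// exactly as RFC 1951 section 3.2.2 prescribes.
	maxLen := 0
	count := map[int]int{}
	for _, l := range lengths {
		count[l]++
		if l > maxLen {
			maxLen = l
		}
	}
	code := 0
	next := make([]int, maxLen+1)
	for n := 1; n <= maxLen; n++ {
		code = (code + count[n-1]) << 1
		next[n] = code
	}

	// Assign codes in symbol order within each length.
	for _, s := range symbols {
		l := lengths[s]
		fmt.Printf("%c -> %0*b (%d bits)\n", s, l, next[l], l)
		next[l]++
	}
}

The vendored encoder additionally bit-reverses each code before storing it (via reverseBits), since DEFLATE emits Huffman codes least-significant bit first.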
+func siftDownByFreq(data []literalNode, lo, hi, first int) { + root := lo + for { + child := 2*root + 1 + if child >= hi { + break + } + if child+1 < hi && (data[first+child].freq == data[first+child+1].freq && data[first+child].literal < data[first+child+1].literal || data[first+child].freq < data[first+child+1].freq) { + child++ + } + if data[first+root].freq == data[first+child].freq && data[first+root].literal > data[first+child].literal || data[first+root].freq > data[first+child].freq { + return + } + data[first+root], data[first+child] = data[first+child], data[first+root] + root = child + } +} +func doPivotByFreq(data []literalNode, lo, hi int) (midlo, midhi int) { + m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow. + if hi-lo > 40 { + // Tukey's ``Ninther,'' median of three medians of three. + s := (hi - lo) / 8 + medianOfThreeSortByFreq(data, lo, lo+s, lo+2*s) + medianOfThreeSortByFreq(data, m, m-s, m+s) + medianOfThreeSortByFreq(data, hi-1, hi-1-s, hi-1-2*s) + } + medianOfThreeSortByFreq(data, lo, m, hi-1) + + // Invariants are: + // data[lo] = pivot (set up by ChoosePivot) + // data[lo < i < a] < pivot + // data[a <= i < b] <= pivot + // data[b <= i < c] unexamined + // data[c <= i < hi-1] > pivot + // data[hi-1] >= pivot + pivot := lo + a, c := lo+1, hi-1 + + for ; a < c && (data[a].freq == data[pivot].freq && data[a].literal < data[pivot].literal || data[a].freq < data[pivot].freq); a++ { + } + b := a + for { + for ; b < c && (data[pivot].freq == data[b].freq && data[pivot].literal > data[b].literal || data[pivot].freq > data[b].freq); b++ { // data[b] <= pivot + } + for ; b < c && (data[pivot].freq == data[c-1].freq && data[pivot].literal < data[c-1].literal || data[pivot].freq < data[c-1].freq); c-- { // data[c-1] > pivot + } + if b >= c { + break + } + // data[b] > pivot; data[c-1] <= pivot + data[b], data[c-1] = data[c-1], data[b] + b++ + c-- + } + // If hi-c<3 then there are duplicates (by property of median of nine). + // Let's be a bit more conservative, and set border to 5. 
+ protect := hi-c < 5 + if !protect && hi-c < (hi-lo)/4 { + // Lets test some points for equality to pivot + dups := 0 + if data[pivot].freq == data[hi-1].freq && data[pivot].literal > data[hi-1].literal || data[pivot].freq > data[hi-1].freq { // data[hi-1] = pivot + data[c], data[hi-1] = data[hi-1], data[c] + c++ + dups++ + } + if data[b-1].freq == data[pivot].freq && data[b-1].literal > data[pivot].literal || data[b-1].freq > data[pivot].freq { // data[b-1] = pivot + b-- + dups++ + } + // m-lo = (hi-lo)/2 > 6 + // b-lo > (hi-lo)*3/4-1 > 8 + // ==> m < b ==> data[m] <= pivot + if data[m].freq == data[pivot].freq && data[m].literal > data[pivot].literal || data[m].freq > data[pivot].freq { // data[m] = pivot + data[m], data[b-1] = data[b-1], data[m] + b-- + dups++ + } + // if at least 2 points are equal to pivot, assume skewed distribution + protect = dups > 1 + } + if protect { + // Protect against a lot of duplicates + // Add invariant: + // data[a <= i < b] unexamined + // data[b <= i < c] = pivot + for { + for ; a < b && (data[b-1].freq == data[pivot].freq && data[b-1].literal > data[pivot].literal || data[b-1].freq > data[pivot].freq); b-- { // data[b] == pivot + } + for ; a < b && (data[a].freq == data[pivot].freq && data[a].literal < data[pivot].literal || data[a].freq < data[pivot].freq); a++ { // data[a] < pivot + } + if a >= b { + break + } + // data[a] == pivot; data[b-1] < pivot + data[a], data[b-1] = data[b-1], data[a] + a++ + b-- + } + } + // Swap pivot into middle + data[pivot], data[b-1] = data[b-1], data[pivot] + return b - 1, c +} + +// Insertion sort +func insertionSortByFreq(data []literalNode, a, b int) { + for i := a + 1; i < b; i++ { + for j := i; j > a && (data[j].freq == data[j-1].freq && data[j].literal < data[j-1].literal || data[j].freq < data[j-1].freq); j-- { + data[j], data[j-1] = data[j-1], data[j] + } + } +} + +// quickSortByFreq, loosely following Bentley and McIlroy, +// ``Engineering a Sort Function,'' SP&E November 1993. + +// medianOfThreeSortByFreq moves the median of the three values data[m0], data[m1], data[m2] into data[m1]. +func medianOfThreeSortByFreq(data []literalNode, m1, m0, m2 int) { + // sort 3 elements + if data[m1].freq == data[m0].freq && data[m1].literal < data[m0].literal || data[m1].freq < data[m0].freq { + data[m1], data[m0] = data[m0], data[m1] + } + // data[m0] <= data[m1] + if data[m2].freq == data[m1].freq && data[m2].literal < data[m1].literal || data[m2].freq < data[m1].freq { + data[m2], data[m1] = data[m1], data[m2] + // data[m0] <= data[m2] && data[m1] < data[m2] + if data[m1].freq == data[m0].freq && data[m1].literal < data[m0].literal || data[m1].freq < data[m0].freq { + data[m1], data[m0] = data[m0], data[m1] + } + } + // now data[m0] <= data[m1] <= data[m2] +} diff --git a/vendor/github.com/klauspost/compress/flate/huffman_sortByLiteral.go b/vendor/github.com/klauspost/compress/flate/huffman_sortByLiteral.go new file mode 100644 index 000000000..93f1aea10 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/huffman_sortByLiteral.go @@ -0,0 +1,201 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Sort sorts data. +// It makes one call to data.Len to determine n, and O(n*log(n)) calls to +// data.Less and data.Swap. The sort is not guaranteed to be stable. 
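Both hand-rolled sorts in these two files implement fixed orderings over literalNode: sortByFreq above orders by ascending frequency with ties broken by literal value, and sortByLiteral below orders by literal value alone. For reference, the same orderings expressed with sort.Slice; this is an illustrative sketch, not part of the package, and the hand-rolled versions presumably exist to avoid the closure and interface overhead of sort.Slice on this hot path.

package main

import (
	"fmt"
	"sort"
)

type literalNode struct {
	literal uint16
	freq    uint16
}

func main() {
	nodes := []literalNode{{literal: 'b', freq: 3}, {literal: 'a', freq: 3}, {literal: 'c', freq: 1}}

	// sortByFreq ordering: ascending frequency, ties broken by ascending literal.
	sort.Slice(nodes, func(i, j int) bool {
		if nodes[i].freq == nodes[j].freq {
			return nodes[i].literal < nodes[j].literal
		}
		return nodes[i].freq < nodes[j].freq
	})
	fmt.Println(nodes) // [{99 1} {97 3} {98 3}]

	// sortByLiteral ordering: ascending literal only.
	sort.Slice(nodes, func(i, j int) bool { return nodes[i].literal < nodes[j].literal })
	fmt.Println(nodes) // [{97 3} {98 3} {99 1}]
}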
+func sortByLiteral(data []literalNode) { + n := len(data) + quickSort(data, 0, n, maxDepth(n)) +} + +func quickSort(data []literalNode, a, b, maxDepth int) { + for b-a > 12 { // Use ShellSort for slices <= 12 elements + if maxDepth == 0 { + heapSort(data, a, b) + return + } + maxDepth-- + mlo, mhi := doPivot(data, a, b) + // Avoiding recursion on the larger subproblem guarantees + // a stack depth of at most lg(b-a). + if mlo-a < b-mhi { + quickSort(data, a, mlo, maxDepth) + a = mhi // i.e., quickSort(data, mhi, b) + } else { + quickSort(data, mhi, b, maxDepth) + b = mlo // i.e., quickSort(data, a, mlo) + } + } + if b-a > 1 { + // Do ShellSort pass with gap 6 + // It could be written in this simplified form cause b-a <= 12 + for i := a + 6; i < b; i++ { + if data[i].literal < data[i-6].literal { + data[i], data[i-6] = data[i-6], data[i] + } + } + insertionSort(data, a, b) + } +} +func heapSort(data []literalNode, a, b int) { + first := a + lo := 0 + hi := b - a + + // Build heap with greatest element at top. + for i := (hi - 1) / 2; i >= 0; i-- { + siftDown(data, i, hi, first) + } + + // Pop elements, largest first, into end of data. + for i := hi - 1; i >= 0; i-- { + data[first], data[first+i] = data[first+i], data[first] + siftDown(data, lo, i, first) + } +} + +// siftDown implements the heap property on data[lo, hi). +// first is an offset into the array where the root of the heap lies. +func siftDown(data []literalNode, lo, hi, first int) { + root := lo + for { + child := 2*root + 1 + if child >= hi { + break + } + if child+1 < hi && data[first+child].literal < data[first+child+1].literal { + child++ + } + if data[first+root].literal > data[first+child].literal { + return + } + data[first+root], data[first+child] = data[first+child], data[first+root] + root = child + } +} +func doPivot(data []literalNode, lo, hi int) (midlo, midhi int) { + m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow. + if hi-lo > 40 { + // Tukey's ``Ninther,'' median of three medians of three. + s := (hi - lo) / 8 + medianOfThree(data, lo, lo+s, lo+2*s) + medianOfThree(data, m, m-s, m+s) + medianOfThree(data, hi-1, hi-1-s, hi-1-2*s) + } + medianOfThree(data, lo, m, hi-1) + + // Invariants are: + // data[lo] = pivot (set up by ChoosePivot) + // data[lo < i < a] < pivot + // data[a <= i < b] <= pivot + // data[b <= i < c] unexamined + // data[c <= i < hi-1] > pivot + // data[hi-1] >= pivot + pivot := lo + a, c := lo+1, hi-1 + + for ; a < c && data[a].literal < data[pivot].literal; a++ { + } + b := a + for { + for ; b < c && data[pivot].literal > data[b].literal; b++ { // data[b] <= pivot + } + for ; b < c && data[pivot].literal < data[c-1].literal; c-- { // data[c-1] > pivot + } + if b >= c { + break + } + // data[b] > pivot; data[c-1] <= pivot + data[b], data[c-1] = data[c-1], data[b] + b++ + c-- + } + // If hi-c<3 then there are duplicates (by property of median of nine). + // Let's be a bit more conservative, and set border to 5. 
+ protect := hi-c < 5 + if !protect && hi-c < (hi-lo)/4 { + // Lets test some points for equality to pivot + dups := 0 + if data[pivot].literal > data[hi-1].literal { // data[hi-1] = pivot + data[c], data[hi-1] = data[hi-1], data[c] + c++ + dups++ + } + if data[b-1].literal > data[pivot].literal { // data[b-1] = pivot + b-- + dups++ + } + // m-lo = (hi-lo)/2 > 6 + // b-lo > (hi-lo)*3/4-1 > 8 + // ==> m < b ==> data[m] <= pivot + if data[m].literal > data[pivot].literal { // data[m] = pivot + data[m], data[b-1] = data[b-1], data[m] + b-- + dups++ + } + // if at least 2 points are equal to pivot, assume skewed distribution + protect = dups > 1 + } + if protect { + // Protect against a lot of duplicates + // Add invariant: + // data[a <= i < b] unexamined + // data[b <= i < c] = pivot + for { + for ; a < b && data[b-1].literal > data[pivot].literal; b-- { // data[b] == pivot + } + for ; a < b && data[a].literal < data[pivot].literal; a++ { // data[a] < pivot + } + if a >= b { + break + } + // data[a] == pivot; data[b-1] < pivot + data[a], data[b-1] = data[b-1], data[a] + a++ + b-- + } + } + // Swap pivot into middle + data[pivot], data[b-1] = data[b-1], data[pivot] + return b - 1, c +} + +// Insertion sort +func insertionSort(data []literalNode, a, b int) { + for i := a + 1; i < b; i++ { + for j := i; j > a && data[j].literal < data[j-1].literal; j-- { + data[j], data[j-1] = data[j-1], data[j] + } + } +} + +// maxDepth returns a threshold at which quicksort should switch +// to heapsort. It returns 2*ceil(lg(n+1)). +func maxDepth(n int) int { + var depth int + for i := n; i > 0; i >>= 1 { + depth++ + } + return depth * 2 +} + +// medianOfThree moves the median of the three values data[m0], data[m1], data[m2] into data[m1]. +func medianOfThree(data []literalNode, m1, m0, m2 int) { + // sort 3 elements + if data[m1].literal < data[m0].literal { + data[m1], data[m0] = data[m0], data[m1] + } + // data[m0] <= data[m1] + if data[m2].literal < data[m1].literal { + data[m2], data[m1] = data[m1], data[m2] + // data[m0] <= data[m2] && data[m1] < data[m2] + if data[m1].literal < data[m0].literal { + data[m1], data[m0] = data[m0], data[m1] + } + } + // now data[m0] <= data[m1] <= data[m2] +} diff --git a/vendor/github.com/klauspost/compress/flate/inflate.go b/vendor/github.com/klauspost/compress/flate/inflate.go new file mode 100644 index 000000000..414c0bea9 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/inflate.go @@ -0,0 +1,793 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package flate implements the DEFLATE compressed data format, described in +// RFC 1951. The gzip and zlib packages implement access to DEFLATE-based file +// formats. +package flate + +import ( + "bufio" + "compress/flate" + "fmt" + "io" + "math/bits" + "sync" +) + +const ( + maxCodeLen = 16 // max length of Huffman code + maxCodeLenMask = 15 // mask for max length of Huffman code + // The next three numbers come from the RFC section 3.2.7, with the + // additional proviso in section 3.2.5 which implies that distance codes + // 30 and 31 should never occur in compressed data. + maxNumLit = 286 + maxNumDist = 30 + numCodes = 19 // number of codes in Huffman meta-code + + debugDecode = false +) + +// Value of length - 3 and extra bits. 
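The lengthExtra table defined just below stores, for every length symbol, the base match length minus 3 plus the number of extra bits to read. A worked example using only the table entry and RFC 1951 (standalone and illustrative, not part of the vendored file): length symbol 269 maps to decCodeToLen[269-257] = {length: 0x10, extra: 0x2}, so with extra bits 0b10 the decoded match length is 16 + 3 + 2 = 21, inside the 19..22 range RFC 1951 assigns to that symbol.

package main

import "fmt"

func main() {
	// decCodeToLen[269-257] from the table below.
	val := struct{ length, extra uint8 }{length: 0x10, extra: 0x2}
	extraBits := uint32(2) // the two extra bits read from the stream
	length := int(val.length) + 3 + int(extraBits)
	fmt.Println(length) // 21
}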
+type lengthExtra struct { + length, extra uint8 +} + +var decCodeToLen = [32]lengthExtra{{length: 0x0, extra: 0x0}, {length: 0x1, extra: 0x0}, {length: 0x2, extra: 0x0}, {length: 0x3, extra: 0x0}, {length: 0x4, extra: 0x0}, {length: 0x5, extra: 0x0}, {length: 0x6, extra: 0x0}, {length: 0x7, extra: 0x0}, {length: 0x8, extra: 0x1}, {length: 0xa, extra: 0x1}, {length: 0xc, extra: 0x1}, {length: 0xe, extra: 0x1}, {length: 0x10, extra: 0x2}, {length: 0x14, extra: 0x2}, {length: 0x18, extra: 0x2}, {length: 0x1c, extra: 0x2}, {length: 0x20, extra: 0x3}, {length: 0x28, extra: 0x3}, {length: 0x30, extra: 0x3}, {length: 0x38, extra: 0x3}, {length: 0x40, extra: 0x4}, {length: 0x50, extra: 0x4}, {length: 0x60, extra: 0x4}, {length: 0x70, extra: 0x4}, {length: 0x80, extra: 0x5}, {length: 0xa0, extra: 0x5}, {length: 0xc0, extra: 0x5}, {length: 0xe0, extra: 0x5}, {length: 0xff, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}} + +var bitMask32 = [32]uint32{ + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, + 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, + 0x1ffff, 0x3ffff, 0x7FFFF, 0xfFFFF, 0x1fFFFF, 0x3fFFFF, 0x7fFFFF, 0xffFFFF, + 0x1ffFFFF, 0x3ffFFFF, 0x7ffFFFF, 0xfffFFFF, 0x1fffFFFF, 0x3fffFFFF, 0x7fffFFFF, +} // up to 32 bits + +// Initialize the fixedHuffmanDecoder only once upon first use. +var fixedOnce sync.Once +var fixedHuffmanDecoder huffmanDecoder + +// A CorruptInputError reports the presence of corrupt input at a given offset. +type CorruptInputError = flate.CorruptInputError + +// An InternalError reports an error in the flate code itself. +type InternalError string + +func (e InternalError) Error() string { return "flate: internal error: " + string(e) } + +// A ReadError reports an error encountered while reading input. +// +// Deprecated: No longer returned. +type ReadError = flate.ReadError + +// A WriteError reports an error encountered while writing output. +// +// Deprecated: No longer returned. +type WriteError = flate.WriteError + +// Resetter resets a ReadCloser returned by NewReader or NewReaderDict to +// to switch to a new underlying Reader. This permits reusing a ReadCloser +// instead of allocating a new one. +type Resetter interface { + // Reset discards any buffered data and resets the Resetter as if it was + // newly initialized with the given reader. + Reset(r io.Reader, dict []byte) error +} + +// The data structure for decoding Huffman tables is based on that of +// zlib. There is a lookup table of a fixed bit width (huffmanChunkBits), +// For codes smaller than the table width, there are multiple entries +// (each combination of trailing bits has the same value). For codes +// larger than the table width, the table contains a link to an overflow +// table. The width of each entry in the link table is the maximum code +// size minus the chunk width. +// +// Note that you can do a lookup in the table even without all bits +// filled. Since the extra bits are zero, and the DEFLATE Huffman codes +// have the property that shorter codes come before longer ones, the +// bit length estimate in the result is a lower bound on the actual +// number of bits. 
+//
+// See the following:
+//	http://www.gzip.org/algorithm.txt
+
+// chunk & 15 is number of bits
+// chunk >> 4 is value, including table link
+
+const (
+	huffmanChunkBits  = 9
+	huffmanNumChunks  = 1 << huffmanChunkBits
+	huffmanCountMask  = 15
+	huffmanValueShift = 4
+)
+
+type huffmanDecoder struct {
+	maxRead  int                       // the maximum number of bits we can read and not overread
+	chunks   *[huffmanNumChunks]uint16 // chunks as described above
+	links    [][]uint16                // overflow links
+	linkMask uint32                    // mask the width of the link table
+}
+
+// Initialize Huffman decoding tables from array of code lengths.
+// Following this function, h is guaranteed to be initialized into a complete
+// tree (i.e., neither over-subscribed nor under-subscribed). The exception is a
+// degenerate case where the tree has only a single symbol with length 1. Empty
+// trees are permitted.
+func (h *huffmanDecoder) init(lengths []int) bool {
+	// Sanity enables additional runtime tests during Huffman
+	// table construction. It's intended to be used during
+	// development to supplement the currently ad-hoc unit tests.
+	const sanity = false
+
+	if h.chunks == nil {
+		h.chunks = &[huffmanNumChunks]uint16{}
+	}
+	if h.maxRead != 0 {
+		*h = huffmanDecoder{chunks: h.chunks, links: h.links}
+	}
+
+	// Count number of codes of each length,
+	// compute maxRead and max length.
+	var count [maxCodeLen]int
+	var min, max int
+	for _, n := range lengths {
+		if n == 0 {
+			continue
+		}
+		if min == 0 || n < min {
+			min = n
+		}
+		if n > max {
+			max = n
+		}
+		count[n&maxCodeLenMask]++
+	}
+
+	// Empty tree. The decompressor.huffSym function will fail later if the tree
+	// is used. Technically, an empty tree is only valid for the HDIST tree and
+	// not the HCLEN and HLIT tree. However, a stream with an empty HCLEN tree
+	// is guaranteed to fail since it will attempt to use the tree to decode the
+	// codes for the HLIT and HDIST trees. Similarly, an empty HLIT tree is
+	// guaranteed to fail later since the compressed data section must be
+	// composed of at least one symbol (the end-of-block marker).
+	if max == 0 {
+		return true
+	}
+
+	code := 0
+	var nextcode [maxCodeLen]int
+	for i := min; i <= max; i++ {
+		code <<= 1
+		nextcode[i&maxCodeLenMask] = code
+		code += count[i&maxCodeLenMask]
+	}
+
+	// Check that the coding is complete (i.e., that we've
+	// assigned all 2-to-the-max possible bit sequences).
+	// Exception: To be compatible with zlib, we also need to
+	// accept degenerate single-code codings. See also
+	// TestDegenerateHuffmanCoding.
+	if code != 1<<uint(max) && !(code == 1 && max == 1) {
+		return false
+	}
+
+	h.maxRead = min
+
+	chunks := h.chunks[:]
+	for i := range chunks {
+		chunks[i] = 0
+	}
+
+	if max > huffmanChunkBits {
+		numLinks := 1 << (uint(max) - huffmanChunkBits)
+		h.linkMask = uint32(numLinks - 1)
+
+		// create link tables
+		link := nextcode[huffmanChunkBits+1] >> 1
+		if cap(h.links) < huffmanNumChunks-link {
+			h.links = make([][]uint16, huffmanNumChunks-link)
+		} else {
+			h.links = h.links[:huffmanNumChunks-link]
+		}
+		for j := uint(link); j < huffmanNumChunks; j++ {
+			reverse := int(bits.Reverse16(uint16(j)))
+			reverse >>= uint(16 - huffmanChunkBits)
+			off := j - uint(link)
+			if sanity && h.chunks[reverse] != 0 {
+				panic("impossible: overwriting existing chunk")
+			}
+			h.chunks[reverse] = uint16(off<<huffmanValueShift | (huffmanChunkBits + 1))
+			h.links[off] = make([]uint16, numLinks)
+		}
+	} else {
+		h.links = h.links[:0]
+	}
+
+	for i, n := range lengths {
+		if n == 0 {
+			continue
+		}
+		code := nextcode[n]
+		nextcode[n]++
+		chunk := uint16(i<<huffmanValueShift | n)
+		reverse := int(bits.Reverse16(uint16(code)))
+		reverse >>= uint(16 - n)
+		if n <= huffmanChunkBits {
+			for off := reverse; off < len(h.chunks); off += 1 << uint(n) {
+				// We should never need to overwrite
+				// an existing chunk. Also, 0 is
+				// never a valid chunk, because the
+				// lower 4 "count" bits should be
+				// between 1 and 15.
+ if sanity && h.chunks[off] != 0 { + panic("impossible: overwriting existing chunk") + } + h.chunks[off] = chunk + } + } else { + j := reverse & (huffmanNumChunks - 1) + if sanity && h.chunks[j]&huffmanCountMask != huffmanChunkBits+1 { + // Longer codes should have been + // associated with a link table above. + panic("impossible: not an indirect chunk") + } + value := h.chunks[j] >> huffmanValueShift + linktab := h.links[value] + reverse >>= huffmanChunkBits + for off := reverse; off < len(linktab); off += 1 << uint(n-huffmanChunkBits) { + if sanity && linktab[off] != 0 { + panic("impossible: overwriting existing chunk") + } + linktab[off] = chunk + } + } + } + + if sanity { + // Above we've sanity checked that we never overwrote + // an existing entry. Here we additionally check that + // we filled the tables completely. + for i, chunk := range h.chunks { + if chunk == 0 { + // As an exception, in the degenerate + // single-code case, we allow odd + // chunks to be missing. + if code == 1 && i%2 == 1 { + continue + } + panic("impossible: missing chunk") + } + } + for _, linktab := range h.links { + for _, chunk := range linktab { + if chunk == 0 { + panic("impossible: missing chunk") + } + } + } + } + + return true +} + +// The actual read interface needed by NewReader. +// If the passed in io.Reader does not also have ReadByte, +// the NewReader will introduce its own buffering. +type Reader interface { + io.Reader + io.ByteReader +} + +// Decompress state. +type decompressor struct { + // Input source. + r Reader + roffset int64 + + // Huffman decoders for literal/length, distance. + h1, h2 huffmanDecoder + + // Length arrays used to define Huffman codes. + bits *[maxNumLit + maxNumDist]int + codebits *[numCodes]int + + // Output history, buffer. + dict dictDecoder + + // Next step in the decompression, + // and decompression state. + step func(*decompressor) + stepState int + err error + toRead []byte + hl, hd *huffmanDecoder + copyLen int + copyDist int + + // Temporary buffer (avoids repeated allocation). + buf [4]byte + + // Input bits, in top of b. + b uint32 + + nb uint + final bool +} + +func (f *decompressor) nextBlock() { + for f.nb < 1+2 { + if f.err = f.moreBits(); f.err != nil { + return + } + } + f.final = f.b&1 == 1 + f.b >>= 1 + typ := f.b & 3 + f.b >>= 2 + f.nb -= 1 + 2 + switch typ { + case 0: + f.dataBlock() + if debugDecode { + fmt.Println("stored block") + } + case 1: + // compressed, fixed Huffman tables + f.hl = &fixedHuffmanDecoder + f.hd = nil + f.huffmanBlockDecoder()() + if debugDecode { + fmt.Println("predefinied huffman block") + } + case 2: + // compressed, dynamic Huffman tables + if f.err = f.readHuffman(); f.err != nil { + break + } + f.hl = &f.h1 + f.hd = &f.h2 + f.huffmanBlockDecoder()() + if debugDecode { + fmt.Println("dynamic huffman block") + } + default: + // 3 is reserved. + if debugDecode { + fmt.Println("reserved data block encountered") + } + f.err = CorruptInputError(f.roffset) + } +} + +func (f *decompressor) Read(b []byte) (int, error) { + for { + if len(f.toRead) > 0 { + n := copy(b, f.toRead) + f.toRead = f.toRead[n:] + if len(f.toRead) == 0 { + return n, f.err + } + return n, nil + } + if f.err != nil { + return 0, f.err + } + f.step(f) + if f.err != nil && len(f.toRead) == 0 { + f.toRead = f.dict.readFlush() // Flush what's left in case of error + } + } +} + +// Support the io.WriteTo interface for io.Copy and friends. 
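Because the decompressor implements WriteTo (below), io.Copy can stream decompressed data straight into a writer without an intermediate buffer. A minimal usage sketch against the public github.com/klauspost/compress/flate API; it is illustrative only and not part of this file.

package main

import (
	"bytes"
	"io"
	"os"

	"github.com/klauspost/compress/flate"
)

func main() {
	// Produce a small DEFLATE stream.
	var compressed bytes.Buffer
	w, _ := flate.NewWriter(&compressed, flate.BestSpeed)
	w.Write([]byte("hello, deflate\n"))
	w.Close()

	// io.Copy detects the io.WriterTo implementation and uses it directly.
	r := flate.NewReader(&compressed)
	defer r.Close()
	io.Copy(os.Stdout, r)
}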
+func (f *decompressor) WriteTo(w io.Writer) (int64, error) {
+	total := int64(0)
+	flushed := false
+	for {
+		if len(f.toRead) > 0 {
+			n, err := w.Write(f.toRead)
+			total += int64(n)
+			if err != nil {
+				f.err = err
+				return total, err
+			}
+			if n != len(f.toRead) {
+				return total, io.ErrShortWrite
+			}
+			f.toRead = f.toRead[:0]
+		}
+		if f.err != nil && flushed {
+			if f.err == io.EOF {
+				return total, nil
+			}
+			return total, f.err
+		}
+		if f.err == nil {
+			f.step(f)
+		}
+		if len(f.toRead) == 0 && f.err != nil && !flushed {
+			f.toRead = f.dict.readFlush() // Flush what's left in case of error
+			flushed = true
+		}
+	}
+}
+
+func (f *decompressor) Close() error {
+	if f.err == io.EOF {
+		return nil
+	}
+	return f.err
+}
+
+// RFC 1951 section 3.2.7.
+// Compression with dynamic Huffman codes
+
+var codeOrder = [...]int{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}
+
+func (f *decompressor) readHuffman() error {
+	// HLIT[5], HDIST[5], HCLEN[4].
+	for f.nb < 5+5+4 {
+		if err := f.moreBits(); err != nil {
+			return err
+		}
+	}
+	nlit := int(f.b&0x1F) + 257
+	if nlit > maxNumLit {
+		if debugDecode {
+			fmt.Println("nlit > maxNumLit", nlit)
+		}
+		return CorruptInputError(f.roffset)
+	}
+	f.b >>= 5
+	ndist := int(f.b&0x1F) + 1
+	if ndist > maxNumDist {
+		if debugDecode {
+			fmt.Println("ndist > maxNumDist", ndist)
+		}
+		return CorruptInputError(f.roffset)
+	}
+	f.b >>= 5
+	nclen := int(f.b&0xF) + 4
+	// numCodes is 19, so nclen is always valid.
+	f.b >>= 4
+	f.nb -= 5 + 5 + 4
+
+	// (HCLEN+4)*3 bits: code lengths in the magic codeOrder order.
+	for i := 0; i < nclen; i++ {
+		for f.nb < 3 {
+			if err := f.moreBits(); err != nil {
+				return err
+			}
+		}
+		f.codebits[codeOrder[i]] = int(f.b & 0x7)
+		f.b >>= 3
+		f.nb -= 3
+	}
+	for i := nclen; i < len(codeOrder); i++ {
+		f.codebits[codeOrder[i]] = 0
+	}
+	if !f.h1.init(f.codebits[0:]) {
+		if debugDecode {
+			fmt.Println("init codebits failed")
+		}
+		return CorruptInputError(f.roffset)
+	}
+
+	// HLIT + 257 code lengths, HDIST + 1 code lengths,
+	// using the code length Huffman code.
+	for i, n := 0, nlit+ndist; i < n; {
+		x, err := f.huffSym(&f.h1)
+		if err != nil {
+			return err
+		}
+		if x < 16 {
+			// Actual length.
+			f.bits[i] = x
+			i++
+			continue
+		}
+		// Repeat previous length or zero.
+		var rep int
+		var nb uint
+		var b int
+		switch x {
+		default:
+			return InternalError("unexpected length code")
+		case 16:
+			rep = 3
+			nb = 2
+			if i == 0 {
+				if debugDecode {
+					fmt.Println("i==0")
+				}
+				return CorruptInputError(f.roffset)
+			}
+			b = f.bits[i-1]
+		case 17:
+			rep = 3
+			nb = 3
+			b = 0
+		case 18:
+			rep = 11
+			nb = 7
+			b = 0
+		}
+		for f.nb < nb {
+			if err := f.moreBits(); err != nil {
+				if debugDecode {
+					fmt.Println("morebits:", err)
+				}
+				return err
+			}
+		}
+		rep += int(f.b & uint32(1<<(nb&regSizeMaskUint32)-1))
+		f.b >>= nb & regSizeMaskUint32
+		f.nb -= nb
+		if i+rep > n {
+			if debugDecode {
+				fmt.Println("i+rep > n", i, rep, n)
+			}
+			return CorruptInputError(f.roffset)
+		}
+		for j := 0; j < rep; j++ {
+			f.bits[i] = b
+			i++
+		}
+	}
+
+	if !f.h1.init(f.bits[0:nlit]) || !f.h2.init(f.bits[nlit:nlit+ndist]) {
+		if debugDecode {
+			fmt.Println("init2 failed")
+		}
+		return CorruptInputError(f.roffset)
+	}
+
+	// As an optimization, we can initialize the maxRead bits to read at a time
+	// for the HLIT tree to the length of the EOB marker since we know that
+	// every block must terminate with one. This preserves the property that
+	// we never read any extra bytes after the end of the DEFLATE stream.
+ if f.h1.maxRead < f.bits[endBlockMarker] { + f.h1.maxRead = f.bits[endBlockMarker] + } + if !f.final { + // If not the final block, the smallest block possible is + // a predefined table, BTYPE=01, with a single EOB marker. + // This will take up 3 + 7 bits. + f.h1.maxRead += 10 + } + + return nil +} + +// Copy a single uncompressed data block from input to output. +func (f *decompressor) dataBlock() { + // Uncompressed. + // Discard current half-byte. + left := (f.nb) & 7 + f.nb -= left + f.b >>= left + + offBytes := f.nb >> 3 + // Unfilled values will be overwritten. + f.buf[0] = uint8(f.b) + f.buf[1] = uint8(f.b >> 8) + f.buf[2] = uint8(f.b >> 16) + f.buf[3] = uint8(f.b >> 24) + + f.roffset += int64(offBytes) + f.nb, f.b = 0, 0 + + // Length then ones-complement of length. + nr, err := io.ReadFull(f.r, f.buf[offBytes:4]) + f.roffset += int64(nr) + if err != nil { + f.err = noEOF(err) + return + } + n := uint16(f.buf[0]) | uint16(f.buf[1])<<8 + nn := uint16(f.buf[2]) | uint16(f.buf[3])<<8 + if nn != ^n { + if debugDecode { + ncomp := ^n + fmt.Println("uint16(nn) != uint16(^n)", nn, ncomp) + } + f.err = CorruptInputError(f.roffset) + return + } + + if n == 0 { + f.toRead = f.dict.readFlush() + f.finishBlock() + return + } + + f.copyLen = int(n) + f.copyData() +} + +// copyData copies f.copyLen bytes from the underlying reader into f.hist. +// It pauses for reads when f.hist is full. +func (f *decompressor) copyData() { + buf := f.dict.writeSlice() + if len(buf) > f.copyLen { + buf = buf[:f.copyLen] + } + + cnt, err := io.ReadFull(f.r, buf) + f.roffset += int64(cnt) + f.copyLen -= cnt + f.dict.writeMark(cnt) + if err != nil { + f.err = noEOF(err) + return + } + + if f.dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = f.dict.readFlush() + f.step = (*decompressor).copyData + return + } + f.finishBlock() +} + +func (f *decompressor) finishBlock() { + if f.final { + if f.dict.availRead() > 0 { + f.toRead = f.dict.readFlush() + } + f.err = io.EOF + } + f.step = (*decompressor).nextBlock +} + +// noEOF returns err, unless err == io.EOF, in which case it returns io.ErrUnexpectedEOF. +func noEOF(e error) error { + if e == io.EOF { + return io.ErrUnexpectedEOF + } + return e +} + +func (f *decompressor) moreBits() error { + c, err := f.r.ReadByte() + if err != nil { + return noEOF(err) + } + f.roffset++ + f.b |= uint32(c) << (f.nb & regSizeMaskUint32) + f.nb += 8 + return nil +} + +// Read the next Huffman-encoded symbol from f according to h. +func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) { + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. + n := uint(h.maxRead) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. 
+ nb, b := f.nb, f.b + for { + for nb < n { + c, err := f.r.ReadByte() + if err != nil { + f.b = b + f.nb = nb + return 0, noEOF(err) + } + f.roffset++ + b |= uint32(c) << (nb & regSizeMaskUint32) + nb += 8 + } + chunk := h.chunks[b&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = h.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&h.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= nb { + if n == 0 { + f.b = b + f.nb = nb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return 0, f.err + } + f.b = b >> (n & regSizeMaskUint32) + f.nb = nb - n + return int(chunk >> huffmanValueShift), nil + } + } +} + +func makeReader(r io.Reader) Reader { + if rr, ok := r.(Reader); ok { + return rr + } + return bufio.NewReader(r) +} + +func fixedHuffmanDecoderInit() { + fixedOnce.Do(func() { + // These come from the RFC section 3.2.6. + var bits [288]int + for i := 0; i < 144; i++ { + bits[i] = 8 + } + for i := 144; i < 256; i++ { + bits[i] = 9 + } + for i := 256; i < 280; i++ { + bits[i] = 7 + } + for i := 280; i < 288; i++ { + bits[i] = 8 + } + fixedHuffmanDecoder.init(bits[:]) + }) +} + +func (f *decompressor) Reset(r io.Reader, dict []byte) error { + *f = decompressor{ + r: makeReader(r), + bits: f.bits, + codebits: f.codebits, + h1: f.h1, + h2: f.h2, + dict: f.dict, + step: (*decompressor).nextBlock, + } + f.dict.init(maxMatchOffset, dict) + return nil +} + +// NewReader returns a new ReadCloser that can be used +// to read the uncompressed version of r. +// If r does not also implement io.ByteReader, +// the decompressor may read more data than necessary from r. +// It is the caller's responsibility to call Close on the ReadCloser +// when finished reading. +// +// The ReadCloser returned by NewReader also implements Resetter. +func NewReader(r io.Reader) io.ReadCloser { + fixedHuffmanDecoderInit() + + var f decompressor + f.r = makeReader(r) + f.bits = new([maxNumLit + maxNumDist]int) + f.codebits = new([numCodes]int) + f.step = (*decompressor).nextBlock + f.dict.init(maxMatchOffset, nil) + return &f +} + +// NewReaderDict is like NewReader but initializes the reader +// with a preset dictionary. The returned Reader behaves as if +// the uncompressed data stream started with the given dictionary, +// which has already been read. NewReaderDict is typically used +// to read data compressed by NewWriterDict. +// +// The ReadCloser returned by NewReader also implements Resetter. +func NewReaderDict(r io.Reader, dict []byte) io.ReadCloser { + fixedHuffmanDecoderInit() + + var f decompressor + f.r = makeReader(r) + f.bits = new([maxNumLit + maxNumDist]int) + f.codebits = new([numCodes]int) + f.step = (*decompressor).nextBlock + f.dict.init(maxMatchOffset, dict) + return &f +} diff --git a/vendor/github.com/klauspost/compress/flate/inflate_gen.go b/vendor/github.com/klauspost/compress/flate/inflate_gen.go new file mode 100644 index 000000000..61342b6b8 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/inflate_gen.go @@ -0,0 +1,1283 @@ +// Code generated by go generate gen_inflate.go. DO NOT EDIT. + +package flate + +import ( + "bufio" + "bytes" + "fmt" + "math/bits" + "strings" +) + +// Decode a single Huffman block from f. +// hl and hd are the Huffman states for the lit/length values +// and the distance values, respectively. If hd == nil, using the +// fixed distance encoding associated with fixed Huffman blocks. 
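NewReader, NewReaderDict and Resetter above are the package entry points; the per-reader-type decoders that follow are generated specializations behind them. A hedged sketch of reader reuse via Resetter (illustrative only; the deflate helper is mine and error handling is omitted for brevity):

package main

import (
	"bytes"
	"io"

	"github.com/klauspost/compress/flate"
)

// deflate compresses s into a raw DEFLATE stream (illustrative helper).
func deflate(s string) []byte {
	var b bytes.Buffer
	w, _ := flate.NewWriter(&b, flate.DefaultCompression)
	w.Write([]byte(s))
	w.Close()
	return b.Bytes()
}

func main() {
	r := flate.NewReader(bytes.NewReader(deflate("first stream")))
	io.Copy(io.Discard, r)
	r.Close()

	// The returned ReadCloser also implements Resetter, so it can be pointed at
	// a new source (and optional preset dictionary) instead of allocating a new reader.
	r.(flate.Resetter).Reset(bytes.NewReader(deflate("second stream")), nil)
	io.Copy(io.Discard, r)
	r.Close()
}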
+func (f *decompressor) huffmanBytesBuffer() { + const ( + stateInit = iota // Zero value must be stateInit + stateDict + ) + fr := f.r.(*bytes.Buffer) + + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + fnb, fb, dict := f.nb, f.b, &f.dict + + switch f.stepState { + case stateInit: + goto readLiteral + case stateDict: + goto copyHistory + } + +readLiteral: + // Read literal and/or (length, distance) according to RFC section 3.2.3. + { + var v int + { + // Inlined v, err := f.huffSym(f.hl) + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. + n := uint(f.hl.maxRead) + for { + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + f.err = noEOF(err) + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= fnb { + if n == 0 { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n + v = int(chunk >> huffmanValueShift) + break + } + } + } + + var length int + switch { + case v < 256: + dict.writeByte(byte(v)) + if dict.availWrite() == 0 { + f.toRead = dict.readFlush() + f.step = (*decompressor).huffmanBytesBuffer + f.stepState = stateInit + f.b, f.nb = fb, fnb + return + } + goto readLiteral + case v == 256: + f.b, f.nb = fb, fnb + f.finishBlock() + return + // otherwise, reference to older data + case v < 265: + length = v - (257 - 3) + case v < maxNumLit: + val := decCodeToLen[(v - 257)] + length = int(val.length) + 3 + n := uint(val.extra) + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits n>0:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n + default: + if debugDecode { + fmt.Println(v, ">= maxNumLit") + } + f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb + return + } + + var dist uint32 + if f.hd == nil { + for fnb < 5 { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits f.nb<5:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 + } else { + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. + n := uint(f.hd.maxRead) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. 
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << (nb & regSizeMaskUint32)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
+			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
+		default:
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, int(dist)
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
+			f.step = (*decompressor).huffmanBytesBuffer // We need to continue this work
+			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
+			return
+		}
+		goto readLiteral
+	}
+	// Not reached
+}
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) huffmanBytesReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(*bytes.Reader)
+
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+ n := uint(f.hl.maxRead) + for { + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + f.err = noEOF(err) + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= fnb { + if n == 0 { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n + v = int(chunk >> huffmanValueShift) + break + } + } + } + + var length int + switch { + case v < 256: + dict.writeByte(byte(v)) + if dict.availWrite() == 0 { + f.toRead = dict.readFlush() + f.step = (*decompressor).huffmanBytesReader + f.stepState = stateInit + f.b, f.nb = fb, fnb + return + } + goto readLiteral + case v == 256: + f.b, f.nb = fb, fnb + f.finishBlock() + return + // otherwise, reference to older data + case v < 265: + length = v - (257 - 3) + case v < maxNumLit: + val := decCodeToLen[(v - 257)] + length = int(val.length) + 3 + n := uint(val.extra) + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits n>0:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n + default: + if debugDecode { + fmt.Println(v, ">= maxNumLit") + } + f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb + return + } + + var dist uint32 + if f.hd == nil { + for fnb < 5 { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits f.nb<5:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 + } else { + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. + n := uint(f.hd.maxRead) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + for { + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + f.err = noEOF(err) + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= fnb { + if n == 0 { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n + dist = uint32(chunk >> huffmanValueShift) + break + } + } + } + + switch { + case dist < 4: + dist++ + case dist < maxNumDist: + nb := uint(dist-2) >> 1 + // have 1 bit in bottom of dist, need nb more. 
+			extra := (dist & 1) << (nb & regSizeMaskUint32)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
+			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
+		default:
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, int(dist)
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
+			f.step = (*decompressor).huffmanBytesReader // We need to continue this work
+			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
+			return
+		}
+		goto readLiteral
+	}
+	// Not reached
+}
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) huffmanBufioReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(*bufio.Reader)
+
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+ n := uint(f.hl.maxRead) + for { + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + f.err = noEOF(err) + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= fnb { + if n == 0 { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n + v = int(chunk >> huffmanValueShift) + break + } + } + } + + var length int + switch { + case v < 256: + dict.writeByte(byte(v)) + if dict.availWrite() == 0 { + f.toRead = dict.readFlush() + f.step = (*decompressor).huffmanBufioReader + f.stepState = stateInit + f.b, f.nb = fb, fnb + return + } + goto readLiteral + case v == 256: + f.b, f.nb = fb, fnb + f.finishBlock() + return + // otherwise, reference to older data + case v < 265: + length = v - (257 - 3) + case v < maxNumLit: + val := decCodeToLen[(v - 257)] + length = int(val.length) + 3 + n := uint(val.extra) + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits n>0:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n + default: + if debugDecode { + fmt.Println(v, ">= maxNumLit") + } + f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb + return + } + + var dist uint32 + if f.hd == nil { + for fnb < 5 { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits f.nb<5:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 + } else { + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. + n := uint(f.hd.maxRead) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + for { + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + f.err = noEOF(err) + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= fnb { + if n == 0 { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n + dist = uint32(chunk >> huffmanValueShift) + break + } + } + } + + switch { + case dist < 4: + dist++ + case dist < maxNumDist: + nb := uint(dist-2) >> 1 + // have 1 bit in bottom of dist, need nb more. 
+			extra := (dist & 1) << (nb & regSizeMaskUint32)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
+			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
+		default:
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, int(dist)
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
+			f.step = (*decompressor).huffmanBufioReader // We need to continue this work
+			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
+			return
+		}
+		goto readLiteral
+	}
+	// Not reached
+}
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) huffmanStringsReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(*strings.Reader)
+
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+ n := uint(f.hl.maxRead) + for { + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + f.err = noEOF(err) + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= fnb { + if n == 0 { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n + v = int(chunk >> huffmanValueShift) + break + } + } + } + + var length int + switch { + case v < 256: + dict.writeByte(byte(v)) + if dict.availWrite() == 0 { + f.toRead = dict.readFlush() + f.step = (*decompressor).huffmanStringsReader + f.stepState = stateInit + f.b, f.nb = fb, fnb + return + } + goto readLiteral + case v == 256: + f.b, f.nb = fb, fnb + f.finishBlock() + return + // otherwise, reference to older data + case v < 265: + length = v - (257 - 3) + case v < maxNumLit: + val := decCodeToLen[(v - 257)] + length = int(val.length) + 3 + n := uint(val.extra) + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits n>0:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n + default: + if debugDecode { + fmt.Println(v, ">= maxNumLit") + } + f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb + return + } + + var dist uint32 + if f.hd == nil { + for fnb < 5 { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits f.nb<5:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 + } else { + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. + n := uint(f.hd.maxRead) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + for { + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + f.err = noEOF(err) + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= fnb { + if n == 0 { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n + dist = uint32(chunk >> huffmanValueShift) + break + } + } + } + + switch { + case dist < 4: + dist++ + case dist < maxNumDist: + nb := uint(dist-2) >> 1 + // have 1 bit in bottom of dist, need nb more. 
+ extra := (dist & 1) << (nb & regSizeMaskUint32) + for fnb < nb { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits f.nb<nb:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + extra |= fb & bitMask32[nb] + fb >>= nb & regSizeMaskUint32 + fnb -= nb + dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra + // slower: dist = bitMask32[nb+1] + 2 + extra + default: + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("dist too big:", dist, maxNumDist) + } + f.err = CorruptInputError(f.roffset) + return + } + + // No check on length; encoding can be prescient. + if dist > uint32(dict.histSize()) { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("dist > dict.histSize():", dist, dict.histSize()) + } + f.err = CorruptInputError(f.roffset) + return + } + + f.copyLen, f.copyDist = length, int(dist) + goto copyHistory + } + +copyHistory: + // Perform a backwards copy according to RFC section 3.2.3. + { + cnt := dict.tryWriteCopy(f.copyDist, f.copyLen) + if cnt == 0 { + cnt = dict.writeCopy(f.copyDist, f.copyLen) + } + f.copyLen -= cnt + + if dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = dict.readFlush() + f.step = (*decompressor).huffmanStringsReader // We need to continue this work + f.stepState = stateDict + f.b, f.nb = fb, fnb + return + } + goto readLiteral + } + // Not reached +} + +// Decode a single Huffman block from f. +// hl and hd are the Huffman states for the lit/length values +// and the distance values, respectively. If hd == nil, using the +// fixed distance encoding associated with fixed Huffman blocks. +func (f *decompressor) huffmanGenericReader() { + const ( + stateInit = iota // Zero value must be stateInit + stateDict + ) + fr := f.r.(Reader) + + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + fnb, fb, dict := f.nb, f.b, &f.dict + + switch f.stepState { + case stateInit: + goto readLiteral + case stateDict: + goto copyHistory + } + +readLiteral: + // Read literal and/or (length, distance) according to RFC section 3.2.3. + { + var v int + { + // Inlined v, err := f.huffSym(f.hl) + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below.
+ n := uint(f.hl.maxRead) + for { + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + f.err = noEOF(err) + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= fnb { + if n == 0 { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n + v = int(chunk >> huffmanValueShift) + break + } + } + } + + var length int + switch { + case v < 256: + dict.writeByte(byte(v)) + if dict.availWrite() == 0 { + f.toRead = dict.readFlush() + f.step = (*decompressor).huffmanGenericReader + f.stepState = stateInit + f.b, f.nb = fb, fnb + return + } + goto readLiteral + case v == 256: + f.b, f.nb = fb, fnb + f.finishBlock() + return + // otherwise, reference to older data + case v < 265: + length = v - (257 - 3) + case v < maxNumLit: + val := decCodeToLen[(v - 257)] + length = int(val.length) + 3 + n := uint(val.extra) + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits n>0:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n + default: + if debugDecode { + fmt.Println(v, ">= maxNumLit") + } + f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb + return + } + + var dist uint32 + if f.hd == nil { + for fnb < 5 { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits f.nb<5:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 + } else { + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. + n := uint(f.hd.maxRead) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + for { + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + f.err = noEOF(err) + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= fnb { + if n == 0 { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n + dist = uint32(chunk >> huffmanValueShift) + break + } + } + } + + switch { + case dist < 4: + dist++ + case dist < maxNumDist: + nb := uint(dist-2) >> 1 + // have 1 bit in bottom of dist, need nb more. 
+ extra := (dist & 1) << (nb & regSizeMaskUint32) + for fnb < nb { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits f.nb<nb:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + extra |= fb & bitMask32[nb] + fb >>= nb & regSizeMaskUint32 + fnb -= nb + dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra + // slower: dist = bitMask32[nb+1] + 2 + extra + default: + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("dist too big:", dist, maxNumDist) + } + f.err = CorruptInputError(f.roffset) + return + } + + // No check on length; encoding can be prescient. + if dist > uint32(dict.histSize()) { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("dist > dict.histSize():", dist, dict.histSize()) + } + f.err = CorruptInputError(f.roffset) + return + } + + f.copyLen, f.copyDist = length, int(dist) + goto copyHistory + } + +copyHistory: + // Perform a backwards copy according to RFC section 3.2.3. + { + cnt := dict.tryWriteCopy(f.copyDist, f.copyLen) + if cnt == 0 { + cnt = dict.writeCopy(f.copyDist, f.copyLen) + } + f.copyLen -= cnt + + if dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = dict.readFlush() + f.step = (*decompressor).huffmanGenericReader // We need to continue this work + f.stepState = stateDict + f.b, f.nb = fb, fnb + return + } + goto readLiteral + } + // Not reached +} + +func (f *decompressor) huffmanBlockDecoder() func() { + switch f.r.(type) { + case *bytes.Buffer: + return f.huffmanBytesBuffer + case *bytes.Reader: + return f.huffmanBytesReader + case *bufio.Reader: + return f.huffmanBufioReader + case *strings.Reader: + return f.huffmanStringsReader + case Reader: + return f.huffmanGenericReader + default: + return f.huffmanGenericReader + } +} diff --git a/vendor/github.com/klauspost/compress/flate/level1.go b/vendor/github.com/klauspost/compress/flate/level1.go new file mode 100644 index 000000000..0f14f8d63 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level1.go @@ -0,0 +1,240 @@ +package flate + +import ( + "encoding/binary" + "fmt" + "math/bits" +) + +// fastGen maintains the table for matches, +// and the previous byte block for level 2. +// This is the generic implementation. +type fastEncL1 struct { + fastGen + table [tableSize]tableEntry +} + +// EncodeL1 uses a similar algorithm to level 1 +func (e *fastEncL1) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + if debugDeflate && e.cur < 0 { + panic(fmt.Sprint("e.cur < 0: ", e.cur)) + } + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies.
+ sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load3232(src, s) + + for { + const skipLog = 5 + const doEvery = 2 + + nextS := s + var candidate tableEntry + for { + nextHash := hash(cv) + candidate = e.table[nextHash] + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + + now := load6432(src, nextS) + e.table[nextHash] = tableEntry{offset: s + e.cur} + nextHash = hash(uint32(now)) + + offset := s - (candidate.offset - e.cur) + if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} + break + } + + // Do one right away... + cv = uint32(now) + s = nextS + nextS++ + candidate = e.table[nextHash] + now >>= 8 + e.table[nextHash] = tableEntry{offset: s + e.cur} + + offset = s - (candidate.offset - e.cur) + if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} + break + } + cv = uint32(now) + s = nextS + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + + // Extend the 4-byte match as long as possible. + t := candidate.offset - e.cur + var l = int32(4) + if false { + l = e.matchlenLong(s+4, t+4, src) + 4 + } else { + // inlined: + a := src[s+4:] + b := src[t+4:] + for len(a) >= 8 { + if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 { + l += int32(bits.TrailingZeros64(diff) >> 3) + break + } + l += 8 + a = a[8:] + b = b[8:] + } + if len(a) < 8 { + b = b[:len(a)] + for i := range a { + if a[i] != b[i] { + break + } + l++ + } + } + } + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + } + + // Save the match found + if false { + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + } else { + // Inlined... + xoffset := uint32(s - t - baseMatchOffset) + xlength := l + oc := offsetCode(xoffset) + xoffset |= oc << 16 + for xlength > 0 { + xl := xlength + if xl > 258 { + if xl > 258+baseMatchLength { + xl = 258 + } else { + xl = 258 - baseMatchLength + } + } + xlength -= xl + xl -= baseMatchLength + dst.extraHist[lengthCodes1[uint8(xl)]]++ + dst.offHist[oc]++ + dst.tokens[dst.n] = token(matchType | uint32(xl)<= s { + s = nextS + 1 + } + if s >= sLimit { + // Index first pair after match end. + if int(s+l+4) < len(src) { + cv := load3232(src, s) + e.table[hash(cv)] = tableEntry{offset: s + e.cur} + } + goto emitRemainder + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 and at s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. At least on GOARCH=amd64, these three hash calculations + // are faster as one load64 call (with some shifts) instead of + // three load32 calls. 
+ x := load6432(src, s-2) + o := e.cur + s - 2 + prevHash := hash(uint32(x)) + e.table[prevHash] = tableEntry{offset: o} + x >>= 16 + currHash := hash(uint32(x)) + candidate = e.table[currHash] + e.table[currHash] = tableEntry{offset: o + 2} + + offset := s - (candidate.offset - e.cur) + if offset > maxMatchOffset || uint32(x) != load3232(src, candidate.offset-e.cur) { + cv = uint32(x >> 8) + s++ + break + } + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/level2.go b/vendor/github.com/klauspost/compress/flate/level2.go new file mode 100644 index 000000000..8603fbd55 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level2.go @@ -0,0 +1,213 @@ +package flate + +import "fmt" + +// fastGen maintains the table for matches, +// and the previous byte block for level 2. +// This is the generic implementation. +type fastEncL2 struct { + fastGen + table [bTableSize]tableEntry +} + +// EncodeL2 uses a similar algorithm to level 1, but is capable +// of matching across blocks giving better compression at a small slowdown. +func (e *fastEncL2) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + + if debugDeflate && e.cur < 0 { + panic(fmt.Sprint("e.cur < 0: ", e.cur)) + } + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load3232(src, s) + for { + // When should we start skipping if we haven't found matches in a long while. + const skipLog = 5 + const doEvery = 2 + + nextS := s + var candidate tableEntry + for { + nextHash := hash4u(cv, bTableBits) + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + candidate = e.table[nextHash] + now := load6432(src, nextS) + e.table[nextHash] = tableEntry{offset: s + e.cur} + nextHash = hash4u(uint32(now), bTableBits) + + offset := s - (candidate.offset - e.cur) + if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} + break + } + + // Do one right away... 
+ cv = uint32(now) + s = nextS + nextS++ + candidate = e.table[nextHash] + now >>= 8 + e.table[nextHash] = tableEntry{offset: s + e.cur} + + offset = s - (candidate.offset - e.cur) + if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + break + } + cv = uint32(now) + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + + // Extend the 4-byte match as long as possible. + t := candidate.offset - e.cur + l := e.matchlenLong(s+4, t+4, src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index first pair after match end. + if int(s+l+4) < len(src) { + cv := load3232(src, s) + e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur} + } + goto emitRemainder + } + + // Store every second hash in-between, but offset by 1. + for i := s - l + 2; i < s-5; i += 7 { + x := load6432(src, i) + nextHash := hash4u(uint32(x), bTableBits) + e.table[nextHash] = tableEntry{offset: e.cur + i} + // Skip one + x >>= 16 + nextHash = hash4u(uint32(x), bTableBits) + e.table[nextHash] = tableEntry{offset: e.cur + i + 2} + // Skip one + x >>= 16 + nextHash = hash4u(uint32(x), bTableBits) + e.table[nextHash] = tableEntry{offset: e.cur + i + 4} + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 to s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. At least on GOARCH=amd64, these three hash calculations + // are faster as one load64 call (with some shifts) instead of + // three load32 calls. + x := load6432(src, s-2) + o := e.cur + s - 2 + prevHash := hash4u(uint32(x), bTableBits) + prevHash2 := hash4u(uint32(x>>8), bTableBits) + e.table[prevHash] = tableEntry{offset: o} + e.table[prevHash2] = tableEntry{offset: o + 1} + currHash := hash4u(uint32(x>>16), bTableBits) + candidate = e.table[currHash] + e.table[currHash] = tableEntry{offset: o + 2} + + offset := s - (candidate.offset - e.cur) + if offset > maxMatchOffset || uint32(x>>16) != load3232(src, candidate.offset-e.cur) { + cv = uint32(x >> 24) + s++ + break + } + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. 
+ if dst.n == 0 { + return + } + + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/level3.go b/vendor/github.com/klauspost/compress/flate/level3.go new file mode 100644 index 000000000..039639f89 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level3.go @@ -0,0 +1,240 @@ +package flate + +import "fmt" + +// fastEncL3 +type fastEncL3 struct { + fastGen + table [1 << 16]tableEntryPrev +} + +// Encode uses a similar algorithm to level 2, will check up to two candidates. +func (e *fastEncL3) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 8 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + tableBits = 16 + tableSize = 1 << tableBits + ) + + if debugDeflate && e.cur < 0 { + panic(fmt.Sprint("e.cur < 0: ", e.cur)) + } + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + } + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + e.table[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // Skip if too small. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load3232(src, s) + for { + const skipLog = 6 + nextS := s + var candidate tableEntry + for { + nextHash := hash4u(cv, tableBits) + s = nextS + nextS = s + 1 + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + candidates := e.table[nextHash] + now := load3232(src, nextS) + + // Safe offset distance until s + 4... + minOffset := e.cur + s - (maxMatchOffset - 4) + e.table[nextHash] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur}} + + // Check both candidates + candidate = candidates.Cur + if candidate.offset < minOffset { + cv = now + // Previous will also be invalid, we have nothing. + continue + } + + if cv == load3232(src, candidate.offset-e.cur) { + if candidates.Prev.offset < minOffset || cv != load3232(src, candidates.Prev.offset-e.cur) { + break + } + // Both match and are valid, pick longest. + offset := s - (candidate.offset - e.cur) + o2 := s - (candidates.Prev.offset - e.cur) + l1, l2 := matchLen(src[s+4:], src[s-offset+4:]), matchLen(src[s+4:], src[s-o2+4:]) + if l2 > l1 { + candidate = candidates.Prev + } + break + } else { + // We only check if value mismatches. + // Offset will always be invalid in other cases. + candidate = candidates.Prev + if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) { + break + } + } + cv = now + } + + // Call emitCopy, and then see if another emitCopy could be our next + // move. 
Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + + // Extend the 4-byte match as long as possible. + // + t := candidate.offset - e.cur + l := e.matchlenLong(s+4, t+4, src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + t += l + // Index first pair after match end. + if int(t+4) < len(src) && t > 0 { + cv := load3232(src, t) + nextHash := hash4u(cv, tableBits) + e.table[nextHash] = tableEntryPrev{ + Prev: e.table[nextHash].Cur, + Cur: tableEntry{offset: e.cur + t}, + } + } + goto emitRemainder + } + + // Store every 5th hash in-between. + for i := s - l + 2; i < s-5; i += 5 { + nextHash := hash4u(load3232(src, i), tableBits) + e.table[nextHash] = tableEntryPrev{ + Prev: e.table[nextHash].Cur, + Cur: tableEntry{offset: e.cur + i}} + } + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 to s. + x := load6432(src, s-2) + prevHash := hash4u(uint32(x), tableBits) + + e.table[prevHash] = tableEntryPrev{ + Prev: e.table[prevHash].Cur, + Cur: tableEntry{offset: e.cur + s - 2}, + } + x >>= 8 + prevHash = hash4u(uint32(x), tableBits) + + e.table[prevHash] = tableEntryPrev{ + Prev: e.table[prevHash].Cur, + Cur: tableEntry{offset: e.cur + s - 1}, + } + x >>= 8 + currHash := hash4u(uint32(x), tableBits) + candidates := e.table[currHash] + cv = uint32(x) + e.table[currHash] = tableEntryPrev{ + Prev: candidates.Cur, + Cur: tableEntry{offset: s + e.cur}, + } + + // Check both candidates + candidate = candidates.Cur + minOffset := e.cur + s - (maxMatchOffset - 4) + + if candidate.offset > minOffset { + if cv == load3232(src, candidate.offset-e.cur) { + // Found a match... + continue + } + candidate = candidates.Prev + if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) { + // Match at prev... + continue + } + } + cv = uint32(x >> 8) + s++ + break + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/level4.go b/vendor/github.com/klauspost/compress/flate/level4.go new file mode 100644 index 000000000..1cbffa1ae --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level4.go @@ -0,0 +1,220 @@ +package flate + +import "fmt" + +type fastEncL4 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntry +} + +func (e *fastEncL4) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + if debugDeflate && e.cur < 0 { + panic(fmt.Sprint("e.cur < 0: ", e.cur)) + } + // Protect against e.cur wraparound. 
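Every level encoder in this diff guards wraparound the same way: once e.cur approaches bufferReset, table offsets are rebased so that e.cur can restart at maxMatchOffset. A minimal sketch of the per-entry rule applied by the loop below (hypothetical helper name, not part of the vendored file):

func rebaseOffset(off, cur, histLen int32) int32 {
	minOff := cur + histLen - maxMatchOffset
	if off <= minOff {
		return 0 // already out of matching range, drop the entry
	}
	return off - cur + maxMatchOffset // keep the same distance from the rebased e.cur
}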
+ for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.bTable[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load6432(src, s) + for { + const skipLog = 6 + const doEvery = 1 + + nextS := s + var t int32 + for { + nextHashS := hash4x64(cv, tableBits) + nextHashL := hash7(cv, tableBits) + + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := load6432(src, nextS) + entry := tableEntry{offset: s + e.cur} + e.table[nextHashS] = entry + e.bTable[nextHashL] = entry + + t = lCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.offset-e.cur) { + // We got a long match. Use that. + break + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) { + // Found a 4 match... + lCandidate = e.bTable[hash7(next, tableBits)] + + // If the next long is a candidate, check if we should use that instead... + lOff := nextS - (lCandidate.offset - e.cur) + if lOff < maxMatchOffset && load3232(src, lCandidate.offset-e.cur) == uint32(next) { + l1, l2 := matchLen(src[s+4:], src[t+4:]), matchLen(src[nextS+4:], src[nextS-lOff+4:]) + if l2 > l1 { + s = nextS + t = lCandidate.offset - e.cur + } + } + break + } + cv = next + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + // Extend the 4-byte match as long as possible. + l := e.matchlenLong(s+4, t+4, src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + } + if debugDeflate { + if t >= s { + panic("s-t") + } + if (s - t) > maxMatchOffset { + panic(fmt.Sprintln("mmo", t)) + } + if l < baseMatchLength { + panic("bml") + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index first pair after match end. 
+ if int(s+8) < len(src) { + cv := load6432(src, s) + e.table[hash4x64(cv, tableBits)] = tableEntry{offset: s + e.cur} + e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur} + } + goto emitRemainder + } + + // Store every 3rd hash in-between + if true { + i := nextS + if i < s-1 { + cv := load6432(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + e.bTable[hash7(cv, tableBits)] = t + e.bTable[hash7(cv>>8, tableBits)] = t2 + e.table[hash4u(uint32(cv>>8), tableBits)] = t2 + + i += 3 + for ; i < s-1; i += 3 { + cv := load6432(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + e.bTable[hash7(cv, tableBits)] = t + e.bTable[hash7(cv>>8, tableBits)] = t2 + e.table[hash4u(uint32(cv>>8), tableBits)] = t2 + } + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + x := load6432(src, s-1) + o := e.cur + s - 1 + prevHashS := hash4x64(x, tableBits) + prevHashL := hash7(x, tableBits) + e.table[prevHashS] = tableEntry{offset: o} + e.bTable[prevHashL] = tableEntry{offset: o} + cv = x >> 8 + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/level5.go b/vendor/github.com/klauspost/compress/flate/level5.go new file mode 100644 index 000000000..4b97576bd --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level5.go @@ -0,0 +1,302 @@ +package flate + +import "fmt" + +type fastEncL5 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntryPrev +} + +func (e *fastEncL5) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + if debugDeflate && e.cur < 0 { + panic(fmt.Sprint("e.cur < 0: ", e.cur)) + } + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + v.Prev.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + } + e.bTable[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. 
+ cv := load6432(src, s) + for { + const skipLog = 6 + const doEvery = 1 + + nextS := s + var l int32 + var t int32 + for { + nextHashS := hash4x64(cv, tableBits) + nextHashL := hash7(cv, tableBits) + + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := load6432(src, nextS) + entry := tableEntry{offset: s + e.cur} + e.table[nextHashS] = entry + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = entry, eLong.Cur + + nextHashS = hash4x64(next, tableBits) + nextHashL = hash7(next, tableBits) + + t = lCandidate.Cur.offset - e.cur + if s-t < maxMatchOffset { + if uint32(cv) == load3232(src, lCandidate.Cur.offset-e.cur) { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + t2 := lCandidate.Prev.offset - e.cur + if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) { + l = e.matchlen(s+4, t+4, src) + 4 + ml1 := e.matchlen(s+4, t2+4, src) + 4 + if ml1 > l { + t = t2 + l = ml1 + break + } + } + break + } + t = lCandidate.Prev.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + break + } + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) { + // Found a 4 match... + l = e.matchlen(s+4, t+4, src) + 4 + lCandidate = e.bTable[nextHashL] + // Store the next match + + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + // If the next long is a candidate, use that... + t2 := lCandidate.Cur.offset - e.cur + if nextS-t2 < maxMatchOffset { + if load3232(src, lCandidate.Cur.offset-e.cur) == uint32(next) { + ml := e.matchlen(nextS+4, t2+4, src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + // If the previous long is a candidate, use that... + t2 = lCandidate.Prev.offset - e.cur + if nextS-t2 < maxMatchOffset && load3232(src, lCandidate.Prev.offset-e.cur) == uint32(next) { + ml := e.matchlen(nextS+4, t2+4, src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + } + break + } + cv = next + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + if l == 0 { + // Extend the 4-byte match as long as possible. + l = e.matchlenLong(s+4, t+4, src) + 4 + } else if l == maxMatchLength { + l += e.matchlenLong(s+l, t+l, src) + } + + // Try to locate a better match by checking the end of best match... 
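// In other words: hash the bytes that start where the current match ends
// (sAt = s+l) and look them up in the long table. If an earlier position also
// continued with those same bytes, then t2 = thatOffset - l is a candidate
// start whose match would end on them, so matchlenLong(s, t2, src) can beat
// the current length l. (Explanatory note, not part of the vendored file.)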
+ if sAt := s + l; l < 30 && sAt < sLimit { + eLong := e.bTable[hash7(load6432(src, sAt), tableBits)].Cur.offset + // Test current + t2 := eLong - e.cur - l + off := s - t2 + if t2 >= 0 && off < maxMatchOffset && off > 0 { + if l2 := e.matchlenLong(s, t2, src); l2 > l { + t = t2 + l = l2 + } + } + } + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + } + if debugDeflate { + if t >= s { + panic(fmt.Sprintln("s-t", s, t)) + } + if (s - t) > maxMatchOffset { + panic(fmt.Sprintln("mmo", s-t)) + } + if l < baseMatchLength { + panic("bml") + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + goto emitRemainder + } + + // Store every 3rd hash in-between. + if true { + const hashEvery = 3 + i := s - l + 1 + if i < s-1 { + cv := load6432(src, i) + t := tableEntry{offset: i + e.cur} + e.table[hash4x64(cv, tableBits)] = t + eLong := &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + + // Do an long at i+1 + cv >>= 8 + t = tableEntry{offset: t.offset + 1} + eLong = &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + + // We only have enough bits for a short entry at i+2 + cv >>= 8 + t = tableEntry{offset: t.offset + 1} + e.table[hash4x64(cv, tableBits)] = t + + // Skip one - otherwise we risk hitting 's' + i += 4 + for ; i < s-1; i += hashEvery { + cv := load6432(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + eLong := &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + e.table[hash4u(uint32(cv>>8), tableBits)] = t2 + } + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + x := load6432(src, s-1) + o := e.cur + s - 1 + prevHashS := hash4x64(x, tableBits) + prevHashL := hash7(x, tableBits) + e.table[prevHashS] = tableEntry{offset: o} + eLong := &e.bTable[prevHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: o}, eLong.Cur + cv = x >> 8 + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/level6.go b/vendor/github.com/klauspost/compress/flate/level6.go new file mode 100644 index 000000000..62888edf3 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level6.go @@ -0,0 +1,315 @@ +package flate + +import "fmt" + +type fastEncL6 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntryPrev +} + +func (e *fastEncL6) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + if debugDeflate && e.cur < 0 { + panic(fmt.Sprint("e.cur < 0: ", e.cur)) + } + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. 
+ minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + v.Prev.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + } + e.bTable[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load6432(src, s) + // Repeat MUST be > 1 and within range + repeat := int32(1) + for { + const skipLog = 7 + const doEvery = 1 + + nextS := s + var l int32 + var t int32 + for { + nextHashS := hash4x64(cv, tableBits) + nextHashL := hash7(cv, tableBits) + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := load6432(src, nextS) + entry := tableEntry{offset: s + e.cur} + e.table[nextHashS] = entry + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = entry, eLong.Cur + + // Calculate hashes of 'next' + nextHashS = hash4x64(next, tableBits) + nextHashL = hash7(next, tableBits) + + t = lCandidate.Cur.offset - e.cur + if s-t < maxMatchOffset { + if uint32(cv) == load3232(src, lCandidate.Cur.offset-e.cur) { + // Long candidate matches at least 4 bytes. + + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + // Check the previous long candidate as well. + t2 := lCandidate.Prev.offset - e.cur + if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) { + l = e.matchlen(s+4, t+4, src) + 4 + ml1 := e.matchlen(s+4, t2+4, src) + 4 + if ml1 > l { + t = t2 + l = ml1 + break + } + } + break + } + // Current value did not match, but check if previous long value does. + t = lCandidate.Prev.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + break + } + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) { + // Found a 4 match... 
+ l = e.matchlen(s+4, t+4, src) + 4 + + // Look up next long candidate (at nextS) + lCandidate = e.bTable[nextHashL] + + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + // Check repeat at s + repOff + const repOff = 1 + t2 := s - repeat + repOff + if load3232(src, t2) == uint32(cv>>(8*repOff)) { + ml := e.matchlen(s+4+repOff, t2+4, src) + 4 + if ml > l { + t = t2 + l = ml + s += repOff + // Not worth checking more. + break + } + } + + // If the next long is a candidate, use that... + t2 = lCandidate.Cur.offset - e.cur + if nextS-t2 < maxMatchOffset { + if load3232(src, lCandidate.Cur.offset-e.cur) == uint32(next) { + ml := e.matchlen(nextS+4, t2+4, src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + // This is ok, but check previous as well. + } + } + // If the previous long is a candidate, use that... + t2 = lCandidate.Prev.offset - e.cur + if nextS-t2 < maxMatchOffset && load3232(src, lCandidate.Prev.offset-e.cur) == uint32(next) { + ml := e.matchlen(nextS+4, t2+4, src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + } + break + } + cv = next + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + // Extend the 4-byte match as long as possible. + if l == 0 { + l = e.matchlenLong(s+4, t+4, src) + 4 + } else if l == maxMatchLength { + l += e.matchlenLong(s+l, t+l, src) + } + + // Try to locate a better match by checking the end-of-match... + if sAt := s + l; sAt < sLimit { + eLong := &e.bTable[hash7(load6432(src, sAt), tableBits)] + // Test current + t2 := eLong.Cur.offset - e.cur - l + off := s - t2 + if off < maxMatchOffset { + if off > 0 && t2 >= 0 { + if l2 := e.matchlenLong(s, t2, src); l2 > l { + t = t2 + l = l2 + } + } + // Test next: + t2 = eLong.Prev.offset - e.cur - l + off := s - t2 + if off > 0 && off < maxMatchOffset && t2 >= 0 { + if l2 := e.matchlenLong(s, t2, src); l2 > l { + t = t2 + l = l2 + } + } + } + } + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + } + if false { + if t >= s { + panic(fmt.Sprintln("s-t", s, t)) + } + if (s - t) > maxMatchOffset { + panic(fmt.Sprintln("mmo", s-t)) + } + if l < baseMatchLength { + panic("bml") + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + repeat = s - t + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index after match end. + for i := nextS + 1; i < int32(len(src))-8; i += 2 { + cv := load6432(src, i) + e.table[hash4x64(cv, tableBits)] = tableEntry{offset: i + e.cur} + eLong := &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur}, eLong.Cur + } + goto emitRemainder + } + + // Store every long hash in-between and every second short. 
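// (Explanatory note, not part of the vendored file: compared with the lower
// levels above, which index only every few skipped positions, level 6 indexes
// every in-between position in the long table and every second one in the
// short table - more table maintenance in exchange for more future matches.)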
+ if true { + for i := nextS + 1; i < s-1; i += 2 { + cv := load6432(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + eLong := &e.bTable[hash7(cv, tableBits)] + eLong2 := &e.bTable[hash7(cv>>8, tableBits)] + e.table[hash4x64(cv, tableBits)] = t + eLong.Cur, eLong.Prev = t, eLong.Cur + eLong2.Cur, eLong2.Prev = t2, eLong2.Cur + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + cv = load6432(src, s) + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/regmask_amd64.go b/vendor/github.com/klauspost/compress/flate/regmask_amd64.go new file mode 100644 index 000000000..6ed28061b --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/regmask_amd64.go @@ -0,0 +1,37 @@ +package flate + +const ( + // Masks for shifts with register sizes of the shift value. + // This can be used to work around the x86 design of shifting by mod register size. + // It can be used when a variable shift is always smaller than the register size. + + // reg8SizeMaskX - shift value is 8 bits, shifted is X + reg8SizeMask8 = 7 + reg8SizeMask16 = 15 + reg8SizeMask32 = 31 + reg8SizeMask64 = 63 + + // reg16SizeMaskX - shift value is 16 bits, shifted is X + reg16SizeMask8 = reg8SizeMask8 + reg16SizeMask16 = reg8SizeMask16 + reg16SizeMask32 = reg8SizeMask32 + reg16SizeMask64 = reg8SizeMask64 + + // reg32SizeMaskX - shift value is 32 bits, shifted is X + reg32SizeMask8 = reg8SizeMask8 + reg32SizeMask16 = reg8SizeMask16 + reg32SizeMask32 = reg8SizeMask32 + reg32SizeMask64 = reg8SizeMask64 + + // reg64SizeMaskX - shift value is 64 bits, shifted is X + reg64SizeMask8 = reg8SizeMask8 + reg64SizeMask16 = reg8SizeMask16 + reg64SizeMask32 = reg8SizeMask32 + reg64SizeMask64 = reg8SizeMask64 + + // regSizeMaskUintX - shift value is uint, shifted is X + regSizeMaskUint8 = reg8SizeMask8 + regSizeMaskUint16 = reg8SizeMask16 + regSizeMaskUint32 = reg8SizeMask32 + regSizeMaskUint64 = reg8SizeMask64 +) diff --git a/vendor/github.com/klauspost/compress/flate/regmask_other.go b/vendor/github.com/klauspost/compress/flate/regmask_other.go new file mode 100644 index 000000000..1b7a2cbd7 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/regmask_other.go @@ -0,0 +1,40 @@ +//go:build !amd64 +// +build !amd64 + +package flate + +const ( + // Masks for shifts with register sizes of the shift value. + // This can be used to work around the x86 design of shifting by mod register size. + // It can be used when a variable shift is always smaller than the register size. 
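A note on why these masks exist: Go defines x << n as 0 once n reaches the width of x, so the compiler normally has to guard variable shifts against oversized counts. On amd64 the shift instructions already reduce the count modulo the operand width, so writing the count as, for example,

// fb |= uint32(c) << (fnb & regSizeMaskUint32)   // taken from the decoder above

with the mask being 31 on amd64 lets the compiler prove the count is in range and emit a bare shift. On the other architectures covered by this file the masks are all ones, so the same expression keeps its portable meaning unchanged.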
+ + // reg8SizeMaskX - shift value is 8 bits, shifted is X + reg8SizeMask8 = 0xff + reg8SizeMask16 = 0xff + reg8SizeMask32 = 0xff + reg8SizeMask64 = 0xff + + // reg16SizeMaskX - shift value is 16 bits, shifted is X + reg16SizeMask8 = 0xffff + reg16SizeMask16 = 0xffff + reg16SizeMask32 = 0xffff + reg16SizeMask64 = 0xffff + + // reg32SizeMaskX - shift value is 32 bits, shifted is X + reg32SizeMask8 = 0xffffffff + reg32SizeMask16 = 0xffffffff + reg32SizeMask32 = 0xffffffff + reg32SizeMask64 = 0xffffffff + + // reg64SizeMaskX - shift value is 64 bits, shifted is X + reg64SizeMask8 = 0xffffffffffffffff + reg64SizeMask16 = 0xffffffffffffffff + reg64SizeMask32 = 0xffffffffffffffff + reg64SizeMask64 = 0xffffffffffffffff + + // regSizeMaskUintX - shift value is uint, shifted is X + regSizeMaskUint8 = ^uint(0) + regSizeMaskUint16 = ^uint(0) + regSizeMaskUint32 = ^uint(0) + regSizeMaskUint64 = ^uint(0) +) diff --git a/vendor/github.com/klauspost/compress/flate/stateless.go b/vendor/github.com/klauspost/compress/flate/stateless.go new file mode 100644 index 000000000..93a1d1503 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/stateless.go @@ -0,0 +1,305 @@ +package flate + +import ( + "io" + "math" + "sync" +) + +const ( + maxStatelessBlock = math.MaxInt16 + // dictionary will be taken from maxStatelessBlock, so limit it. + maxStatelessDict = 8 << 10 + + slTableBits = 13 + slTableSize = 1 << slTableBits + slTableShift = 32 - slTableBits +) + +type statelessWriter struct { + dst io.Writer + closed bool +} + +func (s *statelessWriter) Close() error { + if s.closed { + return nil + } + s.closed = true + // Emit EOF block + return StatelessDeflate(s.dst, nil, true, nil) +} + +func (s *statelessWriter) Write(p []byte) (n int, err error) { + err = StatelessDeflate(s.dst, p, false, nil) + if err != nil { + return 0, err + } + return len(p), nil +} + +func (s *statelessWriter) Reset(w io.Writer) { + s.dst = w + s.closed = false +} + +// NewStatelessWriter will do compression but without maintaining any state +// between Write calls. +// There will be no memory kept between Write calls, +// but compression and speed will be suboptimal. +// Because of this, the size of actual Write calls will affect output size. +func NewStatelessWriter(dst io.Writer) io.WriteCloser { + return &statelessWriter{dst: dst} +} + +// bitWriterPool contains bit writers that can be reused. +var bitWriterPool = sync.Pool{ + New: func() interface{} { + return newHuffmanBitWriter(nil) + }, +} + +// StatelessDeflate allows compressing directly to a Writer without retaining state. +// When returning everything will be flushed. +// Up to 8KB of an optional dictionary can be given which is presumed to precede the block. +// Longer dictionaries will be truncated and will still produce valid output. +// Sending nil dictionary is perfectly fine. +func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error { + var dst tokens + bw := bitWriterPool.Get().(*huffmanBitWriter) + bw.reset(out) + defer func() { + // don't keep a reference to our output + bw.reset(nil) + bitWriterPool.Put(bw) + }() + if eof && len(in) == 0 { + // Just write an EOF block. + // Could be faster... 
+ bw.writeStoredHeader(0, true) + bw.flush() + return bw.err + } + + // Truncate dict + if len(dict) > maxStatelessDict { + dict = dict[len(dict)-maxStatelessDict:] + } + + for len(in) > 0 { + todo := in + if len(todo) > maxStatelessBlock-len(dict) { + todo = todo[:maxStatelessBlock-len(dict)] + } + in = in[len(todo):] + uncompressed := todo + if len(dict) > 0 { + // combine dict and source + bufLen := len(todo) + len(dict) + combined := make([]byte, bufLen) + copy(combined, dict) + copy(combined[len(dict):], todo) + todo = combined + } + // Compress + statelessEnc(&dst, todo, int16(len(dict))) + isEof := eof && len(in) == 0 + + if dst.n == 0 { + bw.writeStoredHeader(len(uncompressed), isEof) + if bw.err != nil { + return bw.err + } + bw.writeBytes(uncompressed) + } else if int(dst.n) > len(uncompressed)-len(uncompressed)>>4 { + // If we removed less than 1/16th, huffman compress the block. + bw.writeBlockHuff(isEof, uncompressed, len(in) == 0) + } else { + bw.writeBlockDynamic(&dst, isEof, uncompressed, len(in) == 0) + } + if len(in) > 0 { + // Retain a dict if we have more + dict = todo[len(todo)-maxStatelessDict:] + dst.Reset() + } + if bw.err != nil { + return bw.err + } + } + if !eof { + // Align, only a stored block can do that. + bw.writeStoredHeader(0, false) + } + bw.flush() + return bw.err +} + +func hashSL(u uint32) uint32 { + return (u * 0x1e35a7bd) >> slTableShift +} + +func load3216(b []byte, i int16) uint32 { + // Help the compiler eliminate bounds checks on the read so it can be done in a single read. + b = b[i:] + b = b[:4] + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +func load6416(b []byte, i int16) uint64 { + // Help the compiler eliminate bounds checks on the read so it can be done in a single read. + b = b[i:] + b = b[:8] + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +func statelessEnc(dst *tokens, src []byte, startAt int16) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + + type tableEntry struct { + offset int16 + } + + var table [slTableSize]tableEntry + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src)-int(startAt) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = 0 + return + } + // Index until startAt + if startAt > 0 { + cv := load3232(src, 0) + for i := int16(0); i < startAt; i++ { + table[hashSL(cv)] = tableEntry{offset: i} + cv = (cv >> 8) | (uint32(src[i+4]) << 24) + } + } + + s := startAt + 1 + nextEmit := startAt + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int16(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load3216(src, s) + + for { + const skipLog = 5 + const doEvery = 2 + + nextS := s + var candidate tableEntry + for { + nextHash := hashSL(cv) + candidate = table[nextHash] + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit || nextS <= 0 { + goto emitRemainder + } + + now := load6416(src, nextS) + table[nextHash] = tableEntry{offset: s} + nextHash = hashSL(uint32(now)) + + if cv == load3216(src, candidate.offset) { + table[nextHash] = tableEntry{offset: nextS} + break + } + + // Do one right away... 
+ cv = uint32(now) + s = nextS + nextS++ + candidate = table[nextHash] + now >>= 8 + table[nextHash] = tableEntry{offset: s} + + if cv == load3216(src, candidate.offset) { + table[nextHash] = tableEntry{offset: nextS} + break + } + cv = uint32(now) + s = nextS + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + + // Extend the 4-byte match as long as possible. + t := candidate.offset + l := int16(matchLen(src[s+4:], src[t+4:]) + 4) + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + } + + // Save the match found + dst.AddMatchLong(int32(l), uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + if s >= sLimit { + goto emitRemainder + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 and at s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. At least on GOARCH=amd64, these three hash calculations + // are faster as one load64 call (with some shifts) instead of + // three load32 calls. + x := load6416(src, s-2) + o := s - 2 + prevHash := hashSL(uint32(x)) + table[prevHash] = tableEntry{offset: o} + x >>= 16 + currHash := hashSL(uint32(x)) + candidate = table[currHash] + table[currHash] = tableEntry{offset: o + 2} + + if uint32(x) != load3216(src, candidate.offset) { + cv = uint32(x >> 8) + s++ + break + } + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/token.go b/vendor/github.com/klauspost/compress/flate/token.go new file mode 100644 index 000000000..d818790c1 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/token.go @@ -0,0 +1,379 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
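A short usage sketch for the stateless API defined in stateless.go above (illustrative only; buf and payload are placeholder variables, and flate is github.com/klauspost/compress/flate):

var buf bytes.Buffer
w := flate.NewStatelessWriter(&buf)          // keeps no state between Writes,
if _, err := w.Write(payload); err != nil {  // so fewer, larger Writes compress better
	return err
}
return w.Close() // emits the final EOF block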
+ +package flate + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "math" +) + +const ( + // bits 0-16 xoffset = offset - MIN_OFFSET_SIZE, or literal - 16 bits + // bits 16-22 offsetcode - 5 bits + // bits 22-30 xlength = length - MIN_MATCH_LENGTH - 8 bits + // bits 30-32 type 0 = literal 1=EOF 2=Match 3=Unused - 2 bits + lengthShift = 22 + offsetMask = 1<maxnumlit + offHist [32]uint16 // offset codes + litHist [256]uint16 // codes 0->255 + nFilled int + n uint16 // Must be able to contain maxStoreBlockSize + tokens [maxStoreBlockSize + 1]token +} + +func (t *tokens) Reset() { + if t.n == 0 { + return + } + t.n = 0 + t.nFilled = 0 + for i := range t.litHist[:] { + t.litHist[i] = 0 + } + for i := range t.extraHist[:] { + t.extraHist[i] = 0 + } + for i := range t.offHist[:] { + t.offHist[i] = 0 + } +} + +func (t *tokens) Fill() { + if t.n == 0 { + return + } + for i, v := range t.litHist[:] { + if v == 0 { + t.litHist[i] = 1 + t.nFilled++ + } + } + for i, v := range t.extraHist[:literalCount-256] { + if v == 0 { + t.nFilled++ + t.extraHist[i] = 1 + } + } + for i, v := range t.offHist[:offsetCodeCount] { + if v == 0 { + t.offHist[i] = 1 + } + } +} + +func indexTokens(in []token) tokens { + var t tokens + t.indexTokens(in) + return t +} + +func (t *tokens) indexTokens(in []token) { + t.Reset() + for _, tok := range in { + if tok < matchType { + t.AddLiteral(tok.literal()) + continue + } + t.AddMatch(uint32(tok.length()), tok.offset()&matchOffsetOnlyMask) + } +} + +// emitLiteral writes a literal chunk and returns the number of bytes written. +func emitLiteral(dst *tokens, lit []byte) { + for _, v := range lit { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } +} + +func (t *tokens) AddLiteral(lit byte) { + t.tokens[t.n] = token(lit) + t.litHist[lit]++ + t.n++ +} + +// from https://stackoverflow.com/a/28730362 +func mFastLog2(val float32) float32 { + ux := int32(math.Float32bits(val)) + log2 := (float32)(((ux >> 23) & 255) - 128) + ux &= -0x7f800001 + ux += 127 << 23 + uval := math.Float32frombits(uint32(ux)) + log2 += ((-0.34484843)*uval+2.02466578)*uval - 0.67487759 + return log2 +} + +// EstimatedBits will return an minimum size estimated by an *optimal* +// compression of the block. +// The size of the block +func (t *tokens) EstimatedBits() int { + shannon := float32(0) + bits := int(0) + nMatches := 0 + total := int(t.n) + t.nFilled + if total > 0 { + invTotal := 1.0 / float32(total) + for _, v := range t.litHist[:] { + if v > 0 { + n := float32(v) + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n + } + } + // Just add 15 for EOB + shannon += 15 + for i, v := range t.extraHist[1 : literalCount-256] { + if v > 0 { + n := float32(v) + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n + bits += int(lengthExtraBits[i&31]) * int(v) + nMatches += int(v) + } + } + } + if nMatches > 0 { + invTotal := 1.0 / float32(nMatches) + for i, v := range t.offHist[:offsetCodeCount] { + if v > 0 { + n := float32(v) + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n + bits += int(offsetExtraBits[i&31]) * int(v) + } + } + } + return int(shannon) + bits +} + +// AddMatch adds a match to the tokens. +// This function is very sensitive to inlining and right on the border. 
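Editorial aside, not part of the vendored patch: the EstimatedBits helper above relies on mFastLog2, a bit-twiddling base-2 logarithm approximation (credited in the source to a Stack Overflow answer). The standalone sketch below copies that helper verbatim into a main package and compares it against math.Log2, purely to illustrate the size of the approximation error; the sample values are arbitrary.

package main

import (
	"fmt"
	"math"
)

// mFastLog2 is copied from the vendored token.go above: it extracts the
// float exponent as an integer, rescales the mantissa into [1,2), and
// applies a small polynomial to approximate log2 of the mantissa.
func mFastLog2(val float32) float32 {
	ux := int32(math.Float32bits(val))
	log2 := (float32)(((ux >> 23) & 255) - 128)
	ux &= -0x7f800001
	ux += 127 << 23
	uval := math.Float32frombits(uint32(ux))
	log2 += ((-0.34484843)*uval+2.02466578)*uval - 0.67487759
	return log2
}

func main() {
	// Arbitrary positive probabilities/counts, as used by EstimatedBits.
	for _, v := range []float32{0.001, 0.1, 0.5, 1, 2, 1000} {
		fmt.Printf("v=%8.3f  fast=%9.4f  exact=%9.4f\n",
			v, mFastLog2(v), math.Log2(float64(v)))
	}
}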
+func (t *tokens) AddMatch(xlength uint32, xoffset uint32) { + if debugDeflate { + if xlength >= maxMatchLength+baseMatchLength { + panic(fmt.Errorf("invalid length: %v", xlength)) + } + if xoffset >= maxMatchOffset+baseMatchOffset { + panic(fmt.Errorf("invalid offset: %v", xoffset)) + } + } + oCode := offsetCode(xoffset) + xoffset |= oCode << 16 + + t.extraHist[lengthCodes1[uint8(xlength)]]++ + t.offHist[oCode&31]++ + t.tokens[t.n] = token(matchType | xlength<= maxMatchOffset+baseMatchOffset { + panic(fmt.Errorf("invalid offset: %v", xoffset)) + } + } + oc := offsetCode(xoffset) + xoffset |= oc << 16 + for xlength > 0 { + xl := xlength + if xl > 258 { + // We need to have at least baseMatchLength left over for next loop. + if xl > 258+baseMatchLength { + xl = 258 + } else { + xl = 258 - baseMatchLength + } + } + xlength -= xl + xl -= baseMatchLength + t.extraHist[lengthCodes1[uint8(xl)]]++ + t.offHist[oc&31]++ + t.tokens[t.n] = token(matchType | uint32(xl)<> lengthShift) } + +// Convert length to code. +func lengthCode(len uint8) uint8 { return lengthCodes[len] } + +// Returns the offset code corresponding to a specific offset +func offsetCode(off uint32) uint32 { + if false { + if off < uint32(len(offsetCodes)) { + return offsetCodes[off&255] + } else if off>>7 < uint32(len(offsetCodes)) { + return offsetCodes[(off>>7)&255] + 14 + } else { + return offsetCodes[(off>>14)&255] + 28 + } + } + if off < uint32(len(offsetCodes)) { + return offsetCodes[uint8(off)] + } + return offsetCodes14[uint8(off>>7)] +} diff --git a/vendor/github.com/klauspost/compress/gzip/gunzip.go b/vendor/github.com/klauspost/compress/gzip/gunzip.go new file mode 100644 index 000000000..66fe5ddf7 --- /dev/null +++ b/vendor/github.com/klauspost/compress/gzip/gunzip.go @@ -0,0 +1,349 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package gzip implements reading and writing of gzip format compressed files, +// as specified in RFC 1952. +package gzip + +import ( + "bufio" + "compress/gzip" + "encoding/binary" + "hash/crc32" + "io" + "time" + + "github.com/klauspost/compress/flate" +) + +const ( + gzipID1 = 0x1f + gzipID2 = 0x8b + gzipDeflate = 8 + flagText = 1 << 0 + flagHdrCrc = 1 << 1 + flagExtra = 1 << 2 + flagName = 1 << 3 + flagComment = 1 << 4 +) + +var ( + // ErrChecksum is returned when reading GZIP data that has an invalid checksum. + ErrChecksum = gzip.ErrChecksum + // ErrHeader is returned when reading GZIP data that has an invalid header. + ErrHeader = gzip.ErrHeader +) + +var le = binary.LittleEndian + +// noEOF converts io.EOF to io.ErrUnexpectedEOF. +func noEOF(err error) error { + if err == io.EOF { + return io.ErrUnexpectedEOF + } + return err +} + +// The gzip file stores a header giving metadata about the compressed file. +// That header is exposed as the fields of the Writer and Reader structs. +// +// Strings must be UTF-8 encoded and may only contain Unicode code points +// U+0001 through U+00FF, due to limitations of the GZIP file format. +type Header struct { + Comment string // comment + Extra []byte // "extra data" + ModTime time.Time // modification time + Name string // file name + OS byte // operating system type +} + +// A Reader is an io.Reader that can be read to retrieve +// uncompressed data from a gzip-format compressed file. +// +// In general, a gzip file can be a concatenation of gzip files, +// each with its own header. 
Reads from the Reader +// return the concatenation of the uncompressed data of each. +// Only the first header is recorded in the Reader fields. +// +// Gzip files store a length and checksum of the uncompressed data. +// The Reader will return a ErrChecksum when Read +// reaches the end of the uncompressed data if it does not +// have the expected length or checksum. Clients should treat data +// returned by Read as tentative until they receive the io.EOF +// marking the end of the data. +type Reader struct { + Header // valid after NewReader or Reader.Reset + r flate.Reader + br *bufio.Reader + decompressor io.ReadCloser + digest uint32 // CRC-32, IEEE polynomial (section 8) + size uint32 // Uncompressed size (section 2.3.1) + buf [512]byte + err error + multistream bool +} + +// NewReader creates a new Reader reading the given reader. +// If r does not also implement io.ByteReader, +// the decompressor may read more data than necessary from r. +// +// It is the caller's responsibility to call Close on the Reader when done. +// +// The Reader.Header fields will be valid in the Reader returned. +func NewReader(r io.Reader) (*Reader, error) { + z := new(Reader) + if err := z.Reset(r); err != nil { + return nil, err + } + return z, nil +} + +// Reset discards the Reader z's state and makes it equivalent to the +// result of its original state from NewReader, but reading from r instead. +// This permits reusing a Reader rather than allocating a new one. +func (z *Reader) Reset(r io.Reader) error { + *z = Reader{ + decompressor: z.decompressor, + multistream: true, + } + if rr, ok := r.(flate.Reader); ok { + z.r = rr + } else { + // Reuse if we can. + if z.br != nil { + z.br.Reset(r) + } else { + z.br = bufio.NewReader(r) + } + z.r = z.br + } + z.Header, z.err = z.readHeader() + return z.err +} + +// Multistream controls whether the reader supports multistream files. +// +// If enabled (the default), the Reader expects the input to be a sequence +// of individually gzipped data streams, each with its own header and +// trailer, ending at EOF. The effect is that the concatenation of a sequence +// of gzipped files is treated as equivalent to the gzip of the concatenation +// of the sequence. This is standard behavior for gzip readers. +// +// Calling Multistream(false) disables this behavior; disabling the behavior +// can be useful when reading file formats that distinguish individual gzip +// data streams or mix gzip data streams with other data streams. +// In this mode, when the Reader reaches the end of the data stream, +// Read returns io.EOF. If the underlying reader implements io.ByteReader, +// it will be left positioned just after the gzip stream. +// To start the next stream, call z.Reset(r) followed by z.Multistream(false). +// If there is no next stream, z.Reset(r) will return io.EOF. +func (z *Reader) Multistream(ok bool) { + z.multistream = ok +} + +// readString reads a NUL-terminated string from z.r. +// It treats the bytes read as being encoded as ISO 8859-1 (Latin-1) and +// will output a string encoded using UTF-8. +// This method always updates z.digest with the data read. +func (z *Reader) readString() (string, error) { + var err error + needConv := false + for i := 0; ; i++ { + if i >= len(z.buf) { + return "", ErrHeader + } + z.buf[i], err = z.r.ReadByte() + if err != nil { + return "", err + } + if z.buf[i] > 0x7f { + needConv = true + } + if z.buf[i] == 0 { + // Digest covers the NUL terminator. 
+ z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:i+1]) + + // Strings are ISO 8859-1, Latin-1 (RFC 1952, section 2.3.1). + if needConv { + s := make([]rune, 0, i) + for _, v := range z.buf[:i] { + s = append(s, rune(v)) + } + return string(s), nil + } + return string(z.buf[:i]), nil + } + } +} + +// readHeader reads the GZIP header according to section 2.3.1. +// This method does not set z.err. +func (z *Reader) readHeader() (hdr Header, err error) { + if _, err = io.ReadFull(z.r, z.buf[:10]); err != nil { + // RFC 1952, section 2.2, says the following: + // A gzip file consists of a series of "members" (compressed data sets). + // + // Other than this, the specification does not clarify whether a + // "series" is defined as "one or more" or "zero or more". To err on the + // side of caution, Go interprets this to mean "zero or more". + // Thus, it is okay to return io.EOF here. + return hdr, err + } + if z.buf[0] != gzipID1 || z.buf[1] != gzipID2 || z.buf[2] != gzipDeflate { + return hdr, ErrHeader + } + flg := z.buf[3] + hdr.ModTime = time.Unix(int64(le.Uint32(z.buf[4:8])), 0) + // z.buf[8] is XFL and is currently ignored. + hdr.OS = z.buf[9] + z.digest = crc32.ChecksumIEEE(z.buf[:10]) + + if flg&flagExtra != 0 { + if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil { + return hdr, noEOF(err) + } + z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:2]) + data := make([]byte, le.Uint16(z.buf[:2])) + if _, err = io.ReadFull(z.r, data); err != nil { + return hdr, noEOF(err) + } + z.digest = crc32.Update(z.digest, crc32.IEEETable, data) + hdr.Extra = data + } + + var s string + if flg&flagName != 0 { + if s, err = z.readString(); err != nil { + return hdr, err + } + hdr.Name = s + } + + if flg&flagComment != 0 { + if s, err = z.readString(); err != nil { + return hdr, err + } + hdr.Comment = s + } + + if flg&flagHdrCrc != 0 { + if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil { + return hdr, noEOF(err) + } + digest := le.Uint16(z.buf[:2]) + if digest != uint16(z.digest) { + return hdr, ErrHeader + } + } + + z.digest = 0 + if z.decompressor == nil { + z.decompressor = flate.NewReader(z.r) + } else { + z.decompressor.(flate.Resetter).Reset(z.r, nil) + } + return hdr, nil +} + +// Read implements io.Reader, reading uncompressed bytes from its underlying Reader. +func (z *Reader) Read(p []byte) (n int, err error) { + if z.err != nil { + return 0, z.err + } + + for n == 0 { + n, z.err = z.decompressor.Read(p) + z.digest = crc32.Update(z.digest, crc32.IEEETable, p[:n]) + z.size += uint32(n) + if z.err != io.EOF { + // In the normal case we return here. + return n, z.err + } + + // Finished file; check checksum and size. + if _, err := io.ReadFull(z.r, z.buf[:8]); err != nil { + z.err = noEOF(err) + return n, z.err + } + digest := le.Uint32(z.buf[:4]) + size := le.Uint32(z.buf[4:8]) + if digest != z.digest || size != z.size { + z.err = ErrChecksum + return n, z.err + } + z.digest, z.size = 0, 0 + + // File is ok; check if there is another. + if !z.multistream { + return n, io.EOF + } + z.err = nil // Remove io.EOF + + if _, z.err = z.readHeader(); z.err != nil { + return n, z.err + } + } + + return n, nil +} + +// Support the io.WriteTo interface for io.Copy and friends. +func (z *Reader) WriteTo(w io.Writer) (int64, error) { + total := int64(0) + crcWriter := crc32.NewIEEE() + for { + if z.err != nil { + if z.err == io.EOF { + return total, nil + } + return total, z.err + } + + // We write both to output and digest. 
+ mw := io.MultiWriter(w, crcWriter) + n, err := z.decompressor.(io.WriterTo).WriteTo(mw) + total += n + z.size += uint32(n) + if err != nil { + z.err = err + return total, z.err + } + + // Finished file; check checksum + size. + if _, err := io.ReadFull(z.r, z.buf[0:8]); err != nil { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + z.err = err + return total, err + } + z.digest = crcWriter.Sum32() + digest := le.Uint32(z.buf[:4]) + size := le.Uint32(z.buf[4:8]) + if digest != z.digest || size != z.size { + z.err = ErrChecksum + return total, z.err + } + z.digest, z.size = 0, 0 + + // File is ok; check if there is another. + if !z.multistream { + return total, nil + } + crcWriter.Reset() + z.err = nil // Remove io.EOF + + if _, z.err = z.readHeader(); z.err != nil { + if z.err == io.EOF { + return total, nil + } + return total, z.err + } + } +} + +// Close closes the Reader. It does not close the underlying io.Reader. +// In order for the GZIP checksum to be verified, the reader must be +// fully consumed until the io.EOF. +func (z *Reader) Close() error { return z.decompressor.Close() } diff --git a/vendor/github.com/klauspost/compress/gzip/gzip.go b/vendor/github.com/klauspost/compress/gzip/gzip.go new file mode 100644 index 000000000..26203851b --- /dev/null +++ b/vendor/github.com/klauspost/compress/gzip/gzip.go @@ -0,0 +1,269 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gzip + +import ( + "errors" + "fmt" + "hash/crc32" + "io" + + "github.com/klauspost/compress/flate" +) + +// These constants are copied from the flate package, so that code that imports +// "compress/gzip" does not also have to import "compress/flate". +const ( + NoCompression = flate.NoCompression + BestSpeed = flate.BestSpeed + BestCompression = flate.BestCompression + DefaultCompression = flate.DefaultCompression + ConstantCompression = flate.ConstantCompression + HuffmanOnly = flate.HuffmanOnly + + // StatelessCompression will do compression but without maintaining any state + // between Write calls. + // There will be no memory kept between Write calls, + // but compression and speed will be suboptimal. + // Because of this, the size of actual Write calls will affect output size. + StatelessCompression = -3 +) + +// A Writer is an io.WriteCloser. +// Writes to a Writer are compressed and written to w. +type Writer struct { + Header // written at first call to Write, Flush, or Close + w io.Writer + level int + err error + compressor *flate.Writer + digest uint32 // CRC-32, IEEE polynomial (section 8) + size uint32 // Uncompressed size (section 2.3.1) + wroteHeader bool + closed bool + buf [10]byte +} + +// NewWriter returns a new Writer. +// Writes to the returned writer are compressed and written to w. +// +// It is the caller's responsibility to call Close on the WriteCloser when done. +// Writes may be buffered and not flushed until Close. +// +// Callers that wish to set the fields in Writer.Header must do so before +// the first call to Write, Flush, or Close. +func NewWriter(w io.Writer) *Writer { + z, _ := NewWriterLevel(w, DefaultCompression) + return z +} + +// NewWriterLevel is like NewWriter but specifies the compression level instead +// of assuming DefaultCompression. +// +// The compression level can be DefaultCompression, NoCompression, or any +// integer value between BestSpeed and BestCompression inclusive. The error +// returned will be nil if the level is valid. 
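Editorial aside, not part of the vendored patch: the level constants above, including the package-specific StatelessCompression mode, are passed directly to NewWriterLevel. A minimal sketch of how they might be exercised follows; the sample payload and the size comparison are assumptions for illustration only.

package main

import (
	"bytes"
	"fmt"
	"log"

	"github.com/klauspost/compress/gzip"
)

func main() {
	// Illustrative input; any reasonably repetitive payload will do.
	payload := bytes.Repeat([]byte("example flow record "), 1024)

	for _, level := range []int{gzip.BestSpeed, gzip.BestCompression, gzip.StatelessCompression} {
		var buf bytes.Buffer
		zw, err := gzip.NewWriterLevel(&buf, level)
		if err != nil {
			log.Fatal(err)
		}
		if _, err := zw.Write(payload); err != nil {
			log.Fatal(err)
		}
		if err := zw.Close(); err != nil {
			log.Fatal(err)
		}
		fmt.Printf("level %d: %d -> %d bytes\n", level, len(payload), buf.Len())
	}
}

With StatelessCompression, each Write call is compressed as an independent block, so feeding the writer many small Writes would enlarge the output; the single large Write above sidesteps that.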
+func NewWriterLevel(w io.Writer, level int) (*Writer, error) { + if level < StatelessCompression || level > BestCompression { + return nil, fmt.Errorf("gzip: invalid compression level: %d", level) + } + z := new(Writer) + z.init(w, level) + return z, nil +} + +func (z *Writer) init(w io.Writer, level int) { + compressor := z.compressor + if level != StatelessCompression { + if compressor != nil { + compressor.Reset(w) + } + } + + *z = Writer{ + Header: Header{ + OS: 255, // unknown + }, + w: w, + level: level, + compressor: compressor, + } +} + +// Reset discards the Writer z's state and makes it equivalent to the +// result of its original state from NewWriter or NewWriterLevel, but +// writing to w instead. This permits reusing a Writer rather than +// allocating a new one. +func (z *Writer) Reset(w io.Writer) { + z.init(w, z.level) +} + +// writeBytes writes a length-prefixed byte slice to z.w. +func (z *Writer) writeBytes(b []byte) error { + if len(b) > 0xffff { + return errors.New("gzip.Write: Extra data is too large") + } + le.PutUint16(z.buf[:2], uint16(len(b))) + _, err := z.w.Write(z.buf[:2]) + if err != nil { + return err + } + _, err = z.w.Write(b) + return err +} + +// writeString writes a UTF-8 string s in GZIP's format to z.w. +// GZIP (RFC 1952) specifies that strings are NUL-terminated ISO 8859-1 (Latin-1). +func (z *Writer) writeString(s string) (err error) { + // GZIP stores Latin-1 strings; error if non-Latin-1; convert if non-ASCII. + needconv := false + for _, v := range s { + if v == 0 || v > 0xff { + return errors.New("gzip.Write: non-Latin-1 header string") + } + if v > 0x7f { + needconv = true + } + } + if needconv { + b := make([]byte, 0, len(s)) + for _, v := range s { + b = append(b, byte(v)) + } + _, err = z.w.Write(b) + } else { + _, err = io.WriteString(z.w, s) + } + if err != nil { + return err + } + // GZIP strings are NUL-terminated. + z.buf[0] = 0 + _, err = z.w.Write(z.buf[:1]) + return err +} + +// Write writes a compressed form of p to the underlying io.Writer. The +// compressed bytes are not necessarily flushed until the Writer is closed. +func (z *Writer) Write(p []byte) (int, error) { + if z.err != nil { + return 0, z.err + } + var n int + // Write the GZIP header lazily. + if !z.wroteHeader { + z.wroteHeader = true + z.buf[0] = gzipID1 + z.buf[1] = gzipID2 + z.buf[2] = gzipDeflate + z.buf[3] = 0 + if z.Extra != nil { + z.buf[3] |= 0x04 + } + if z.Name != "" { + z.buf[3] |= 0x08 + } + if z.Comment != "" { + z.buf[3] |= 0x10 + } + le.PutUint32(z.buf[4:8], uint32(z.ModTime.Unix())) + if z.level == BestCompression { + z.buf[8] = 2 + } else if z.level == BestSpeed { + z.buf[8] = 4 + } else { + z.buf[8] = 0 + } + z.buf[9] = z.OS + n, z.err = z.w.Write(z.buf[:10]) + if z.err != nil { + return n, z.err + } + if z.Extra != nil { + z.err = z.writeBytes(z.Extra) + if z.err != nil { + return n, z.err + } + } + if z.Name != "" { + z.err = z.writeString(z.Name) + if z.err != nil { + return n, z.err + } + } + if z.Comment != "" { + z.err = z.writeString(z.Comment) + if z.err != nil { + return n, z.err + } + } + + if z.compressor == nil && z.level != StatelessCompression { + z.compressor, _ = flate.NewWriter(z.w, z.level) + } + } + z.size += uint32(len(p)) + z.digest = crc32.Update(z.digest, crc32.IEEETable, p) + if z.level == StatelessCompression { + return len(p), flate.StatelessDeflate(z.w, p, false, nil) + } + n, z.err = z.compressor.Write(p) + return n, z.err +} + +// Flush flushes any pending compressed data to the underlying writer. 
+// +// It is useful mainly in compressed network protocols, to ensure that +// a remote reader has enough data to reconstruct a packet. Flush does +// not return until the data has been written. If the underlying +// writer returns an error, Flush returns that error. +// +// In the terminology of the zlib library, Flush is equivalent to Z_SYNC_FLUSH. +func (z *Writer) Flush() error { + if z.err != nil { + return z.err + } + if z.closed || z.level == StatelessCompression { + return nil + } + if !z.wroteHeader { + z.Write(nil) + if z.err != nil { + return z.err + } + } + z.err = z.compressor.Flush() + return z.err +} + +// Close closes the Writer, flushing any unwritten data to the underlying +// io.Writer, but does not close the underlying io.Writer. +func (z *Writer) Close() error { + if z.err != nil { + return z.err + } + if z.closed { + return nil + } + z.closed = true + if !z.wroteHeader { + z.Write(nil) + if z.err != nil { + return z.err + } + } + if z.level == StatelessCompression { + z.err = flate.StatelessDeflate(z.w, nil, true, nil) + } else { + z.err = z.compressor.Close() + } + if z.err != nil { + return z.err + } + le.PutUint32(z.buf[:4], z.digest) + le.PutUint32(z.buf[4:8], z.size) + _, z.err = z.w.Write(z.buf[:8]) + return z.err +} diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go index 671e630a8..9f3e9f79e 100644 --- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go +++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go @@ -27,10 +27,7 @@ func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) const fallback8BitSize = 800 type decompress4xContext struct { - pbr0 *bitReaderShifted - pbr1 *bitReaderShifted - pbr2 *bitReaderShifted - pbr3 *bitReaderShifted + pbr *[4]bitReaderShifted peekBits uint8 out *byte dstEvery int @@ -89,10 +86,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) { ctx := decompress4xContext{ - pbr0: &br[0], - pbr1: &br[1], - pbr2: &br[2], - pbr3: &br[3], + pbr: &br, peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast() out: &out[0], dstEvery: dstEvery, diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s index 6c65c6e2b..dd1a5aecd 100644 --- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s +++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s @@ -4,45 +4,40 @@ // +build amd64,!appengine,!noasm,gc // func decompress4x_main_loop_amd64(ctx *decompress4xContext) -TEXT ·decompress4x_main_loop_amd64(SB), $8-8 +TEXT ·decompress4x_main_loop_amd64(SB), $0-8 XORQ DX, DX // Preload values MOVQ ctx+0(FP), AX - MOVBQZX 32(AX), SI - MOVQ 40(AX), DI - MOVQ DI, BX - MOVQ 72(AX), CX - MOVQ CX, (SP) - MOVQ 48(AX), R8 - MOVQ 56(AX), R9 - MOVQ (AX), R10 - MOVQ 8(AX), R11 - MOVQ 16(AX), R12 - MOVQ 24(AX), R13 + MOVBQZX 8(AX), DI + MOVQ 16(AX), SI + MOVQ 48(AX), BX + MOVQ 24(AX), R9 + MOVQ 32(AX), R10 + MOVQ (AX), R11 // Main loop main_loop: - MOVQ BX, DI - CMPQ DI, (SP) + MOVQ SI, R8 + CMPQ R8, BX SETGE DL // br0.fillFast32() - MOVQ 32(R10), R14 - MOVBQZX 40(R10), R15 - CMPQ R15, $0x20 + MOVQ 32(R11), R12 + MOVBQZX 40(R11), R13 + CMPQ R13, $0x20 JBE skip_fill0 - MOVQ 24(R10), AX - SUBQ $0x20, R15 + MOVQ 24(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R10), BP + MOVQ (R11), R14 // b.value |= 
uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R10) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 24(R11) + ORQ R14, R12 // exhausted = exhausted || (br0.off < 4) CMPQ AX, $0x04 @@ -51,57 +46,57 @@ main_loop: skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br0.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R10) - MOVB R15, 40(R10) - ADDQ R8, DI + // update the bitreader structure + MOVQ R12, 32(R11) + MOVB R13, 40(R11) + ADDQ R9, R8 // br1.fillFast32() - MOVQ 32(R11), R14 - MOVBQZX 40(R11), R15 - CMPQ R15, $0x20 + MOVQ 80(R11), R12 + MOVBQZX 88(R11), R13 + CMPQ R13, $0x20 JBE skip_fill1 - MOVQ 24(R11), AX - SUBQ $0x20, R15 + MOVQ 72(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R11), BP + MOVQ 48(R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R11) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 72(R11) + ORQ R14, R12 // exhausted = exhausted || (br1.off < 4) CMPQ AX, $0x04 @@ -110,57 +105,57 @@ skip_fill0: skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br1.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R11) - MOVB R15, 40(R11) - ADDQ R8, DI + // update the bitreader structure + MOVQ R12, 80(R11) + MOVB R13, 88(R11) + ADDQ R9, R8 // br2.fillFast32() - MOVQ 32(R12), R14 - MOVBQZX 40(R12), R15 - CMPQ R15, $0x20 + MOVQ 128(R11), R12 + MOVBQZX 136(R11), R13 + CMPQ R13, $0x20 JBE skip_fill2 - MOVQ 24(R12), AX - SUBQ $0x20, R15 + MOVQ 120(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R12), BP + MOVQ 96(R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R12) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 120(R11) + ORQ R14, R12 // exhausted = exhausted || (br2.off < 4) CMPQ AX, $0x04 @@ -169,57 +164,57 @@ skip_fill1: skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := 
table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br2.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R12) - MOVB R15, 40(R12) - ADDQ R8, DI + // update the bitreader structure + MOVQ R12, 128(R11) + MOVB R13, 136(R11) + ADDQ R9, R8 // br3.fillFast32() - MOVQ 32(R13), R14 - MOVBQZX 40(R13), R15 - CMPQ R15, $0x20 + MOVQ 176(R11), R12 + MOVBQZX 184(R11), R13 + CMPQ R13, $0x20 JBE skip_fill3 - MOVQ 24(R13), AX - SUBQ $0x20, R15 + MOVQ 168(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R13), BP + MOVQ 144(R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R13) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 168(R11) + ORQ R14, R12 // exhausted = exhausted || (br3.off < 4) CMPQ AX, $0x04 @@ -228,149 +223,142 @@ skip_fill2: skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br3.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R13) - MOVB R15, 40(R13) - ADDQ $0x02, BX + // update the bitreader structure + MOVQ R12, 176(R11) + MOVB R13, 184(R11) + ADDQ $0x02, SI TESTB DL, DL JZ main_loop MOVQ ctx+0(FP), AX - MOVQ 40(AX), CX - MOVQ BX, DX - SUBQ CX, DX - SHLQ $0x02, DX - MOVQ DX, 64(AX) + SUBQ 16(AX), SI + SHLQ $0x02, SI + MOVQ SI, 40(AX) RET // func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) -TEXT ·decompress4x_8b_main_loop_amd64(SB), $16-8 +TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8 XORQ DX, DX // Preload values MOVQ ctx+0(FP), CX - MOVBQZX 32(CX), BX - MOVQ 40(CX), SI - MOVQ SI, (SP) - MOVQ 72(CX), DX - MOVQ DX, 8(SP) - MOVQ 48(CX), DI - MOVQ 56(CX), R8 - MOVQ (CX), R9 - MOVQ 8(CX), R10 - MOVQ 16(CX), R11 - MOVQ 24(CX), R12 + MOVBQZX 8(CX), DI + MOVQ 16(CX), BX + MOVQ 48(CX), SI + MOVQ 24(CX), R9 + MOVQ 32(CX), R10 + MOVQ (CX), R11 // Main loop main_loop: - MOVQ (SP), SI - CMPQ SI, 8(SP) + MOVQ BX, R8 + CMPQ R8, SI SETGE DL - // br1000.fillFast32() - MOVQ 32(R9), R13 - MOVBQZX 40(R9), R14 - CMPQ R14, $0x20 - JBE skip_fill1000 - MOVQ 24(R9), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R9), BP + // br0.fillFast32() + MOVQ 32(R11), R12 + MOVBQZX 40(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill0 + MOVQ 24(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ (R11), R15 // b.value |= 
uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R9) - ORQ BP, R13 - - // exhausted = exhausted || (br1000.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 24(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br0.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1000: +skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -378,88 +366,88 @@ skip_fill1000: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) - - // update the bitreader reader structure - MOVQ R13, 32(R9) - MOVB R14, 40(R9) - ADDQ DI, SI - - // br1001.fillFast32() - MOVQ 32(R10), R13 - MOVBQZX 40(R10), R14 - CMPQ R14, $0x20 - JBE skip_fill1001 - MOVQ 24(R10), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R10), BP + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 32(R11) + MOVB R13, 40(R11) + ADDQ R9, R8 + + // br1.fillFast32() + MOVQ 80(R11), R12 + MOVBQZX 88(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill1 + MOVQ 72(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 48(R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R10) - ORQ BP, R13 - - // exhausted = exhausted || (br1001.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 72(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br1.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1001: +skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + 
SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -467,88 +455,88 @@ skip_fill1001: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) - - // update the bitreader reader structure - MOVQ R13, 32(R10) - MOVB R14, 40(R10) - ADDQ DI, SI - - // br1002.fillFast32() - MOVQ 32(R11), R13 - MOVBQZX 40(R11), R14 - CMPQ R14, $0x20 - JBE skip_fill1002 - MOVQ 24(R11), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R11), BP + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 80(R11) + MOVB R13, 88(R11) + ADDQ R9, R8 + + // br2.fillFast32() + MOVQ 128(R11), R12 + MOVBQZX 136(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill2 + MOVQ 120(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 96(R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R11) - ORQ BP, R13 - - // exhausted = exhausted || (br1002.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 120(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br2.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1002: +skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -556,88 +544,88 @@ skip_fill1002: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) - - // update the bitreader reader structure - MOVQ R13, 32(R11) - MOVB R14, 40(R11) - ADDQ DI, SI - - // br1003.fillFast32() - MOVQ 32(R12), R13 - MOVBQZX 40(R12), R14 - CMPQ R14, $0x20 - JBE skip_fill1003 - MOVQ 24(R12), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R12), BP + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 128(R11) + MOVB R13, 136(R11) + 
ADDQ R9, R8 + + // br3.fillFast32() + MOVQ 176(R11), R12 + MOVBQZX 184(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill3 + MOVQ 168(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 144(R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R12) - ORQ BP, R13 - - // exhausted = exhausted || (br1003.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 168(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br3.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1003: +skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -645,20 +633,18 @@ skip_fill1003: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) + MOVL AX, (R8) - // update the bitreader reader structure - MOVQ R13, 32(R12) - MOVB R14, 40(R12) - ADDQ $0x04, (SP) + // update the bitreader structure + MOVQ R12, 176(R11) + MOVB R13, 184(R11) + ADDQ $0x04, BX TESTB DL, DL JZ main_loop MOVQ ctx+0(FP), AX - MOVQ 40(AX), CX - MOVQ (SP), DX - SUBQ CX, DX - SHLQ $0x02, DX - MOVQ DX, 64(AX) + SUBQ 16(AX), BX + SHLQ $0x02, BX + MOVQ BX, 40(AX) RET // func decompress1x_main_loop_amd64(ctx *decompress1xContext) @@ -750,10 +736,8 @@ loop_condition: // Update ctx structure MOVQ ctx+0(FP), AX - MOVQ DX, CX - MOVQ 16(AX), DX - SUBQ DX, CX - MOVQ CX, 40(AX) + SUBQ 16(AX), DX + MOVQ DX, 40(AX) MOVQ (AX), AX MOVQ R9, 24(AX) MOVQ R10, 32(AX) @@ -847,10 +831,8 @@ loop_condition: // Update ctx structure MOVQ ctx+0(FP), AX - MOVQ DX, CX - MOVQ 16(AX), DX - SUBQ DX, CX - MOVQ CX, 40(AX) + SUBQ 16(AX), DX + MOVQ DX, 40(AX) MOVQ (AX), AX MOVQ R9, 24(AX) MOVQ R10, 32(AX) diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go index b5fa4d3f0..27c0f3c2c 100644 --- a/vendor/github.com/klauspost/compress/s2/decode.go +++ b/vendor/github.com/klauspost/compress/s2/decode.go @@ -11,6 +11,7 @@ import ( "fmt" "io" "io/ioutil" + "math" "runtime" "sync" ) @@ -719,7 +720,11 @@ func (r *Reader) Skip(n int64) error { // decoded[i:j] contains decoded bytes that have not yet been passed on. 
left := int64(r.j - r.i) if left >= n { - r.i += int(n) + tmp := int64(r.i) + n + if tmp > math.MaxInt32 { + return errors.New("s2: internal overflow in skip") + } + r.i = int(tmp) return nil } n -= int64(r.j - r.i) diff --git a/vendor/github.com/klauspost/compress/s2/index.go b/vendor/github.com/klauspost/compress/s2/index.go index 7b24a0060..dd9ecfe71 100644 --- a/vendor/github.com/klauspost/compress/s2/index.go +++ b/vendor/github.com/klauspost/compress/s2/index.go @@ -533,3 +533,66 @@ func (i *Index) JSON() []byte { b, _ := json.MarshalIndent(x, "", " ") return b } + +// RemoveIndexHeaders will trim all headers and trailers from a given index. +// This is expected to save 20 bytes. +// These can be restored using RestoreIndexHeaders. +// This removes a layer of security, but is the most compact representation. +// Returns nil if headers contains errors. +// The returned slice references the provided slice. +func RemoveIndexHeaders(b []byte) []byte { + const save = 4 + len(S2IndexHeader) + len(S2IndexTrailer) + 4 + if len(b) <= save { + return nil + } + if b[0] != ChunkTypeIndex { + return nil + } + chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16 + b = b[4:] + + // Validate we have enough... + if len(b) < chunkLen { + return nil + } + b = b[:chunkLen] + + if !bytes.Equal(b[:len(S2IndexHeader)], []byte(S2IndexHeader)) { + return nil + } + b = b[len(S2IndexHeader):] + if !bytes.HasSuffix(b, []byte(S2IndexTrailer)) { + return nil + } + b = bytes.TrimSuffix(b, []byte(S2IndexTrailer)) + + if len(b) < 4 { + return nil + } + return b[:len(b)-4] +} + +// RestoreIndexHeaders will index restore headers removed by RemoveIndexHeaders. +// No error checking is performed on the input. +// If a 0 length slice is sent, it is returned without modification. +func RestoreIndexHeaders(in []byte) []byte { + if len(in) == 0 { + return in + } + b := make([]byte, 0, 4+len(S2IndexHeader)+len(in)+len(S2IndexTrailer)+4) + b = append(b, ChunkTypeIndex, 0, 0, 0) + b = append(b, []byte(S2IndexHeader)...) + b = append(b, in...) + + var tmp [4]byte + binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)+4+len(S2IndexTrailer))) + b = append(b, tmp[:4]...) + // Trailer + b = append(b, []byte(S2IndexTrailer)...) + + chunkLen := len(b) - skippableFrameHeader + b[1] = uint8(chunkLen >> 0) + b[2] = uint8(chunkLen >> 8) + b[3] = uint8(chunkLen >> 16) + return b +} diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go index 4493baa75..2ad02070d 100644 --- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go +++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go @@ -23,7 +23,7 @@ type byteBuffer interface { readByte() (byte, error) // Skip n bytes. 
- skipN(n int) error + skipN(n int64) error } // in-memory buffer @@ -62,9 +62,12 @@ func (b *byteBuf) readByte() (byte, error) { return r, nil } -func (b *byteBuf) skipN(n int) error { +func (b *byteBuf) skipN(n int64) error { bb := *b - if len(bb) < n { + if n < 0 { + return fmt.Errorf("negative skip (%d) requested", n) + } + if int64(len(bb)) < n { return io.ErrUnexpectedEOF } *b = bb[n:] @@ -120,9 +123,9 @@ func (r *readerWrapper) readByte() (byte, error) { return r.tmp[0], nil } -func (r *readerWrapper) skipN(n int) error { - n2, err := io.CopyN(ioutil.Discard, r.r, int64(n)) - if n2 != int64(n) { +func (r *readerWrapper) skipN(n int64) error { + n2, err := io.CopyN(ioutil.Discard, r.r, n) + if n2 != n { err = io.ErrUnexpectedEOF } return err diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go index 286c8f9d7..d212f4737 100644 --- a/vendor/github.com/klauspost/compress/zstd/decoder.go +++ b/vendor/github.com/klauspost/compress/zstd/decoder.go @@ -348,6 +348,9 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { frame.history.setDict(&dict) } if frame.WindowSize > d.o.maxWindowSize { + if debugDecoder { + println("window size exceeded:", frame.WindowSize, ">", d.o.maxWindowSize) + } return dst, ErrWindowSizeExceeded } if frame.FrameContentSize != fcsUnknown { diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go index e6b1d01cf..7aaaedb23 100644 --- a/vendor/github.com/klauspost/compress/zstd/encoder.go +++ b/vendor/github.com/klauspost/compress/zstd/encoder.go @@ -528,8 +528,8 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte { // If a non-single block is needed the encoder will reset again. e.encoders <- enc }() - // Use single segments when above minimum window and below 1MB. - single := len(src) < 1<<20 && len(src) > MinWindowSize + // Use single segments when above minimum window and below window size. + single := len(src) <= e.o.windowSize && len(src) > MinWindowSize if e.o.single != nil { single = *e.o.single } diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go index 44d8dbd19..a7c5e1aac 100644 --- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go +++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go @@ -283,7 +283,7 @@ func WithNoEntropyCompression(b bool) EOption { // a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range. // For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB. // This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations. -// If this is not specified, block encodes will automatically choose this based on the input size. +// If this is not specified, block encodes will automatically choose this based on the input size and the window size. // This setting has no effect on streamed encodes. 
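Editorial aside, not part of the vendored patch: the EncodeAll change above switches the single-segment heuristic from a fixed 1 MB cutoff to the configured window size, and WithSingleSegment still overrides the heuristic entirely. A small sketch of forcing the behavior explicitly; the nil writer is the documented EncodeAll-only pattern, and the zero-filled sample input is an assumption.

package main

import (
	"fmt"
	"log"

	"github.com/klauspost/compress/zstd"
)

func main() {
	// A nil writer is fine when the encoder is only used via EncodeAll.
	enc, err := zstd.NewWriter(nil, zstd.WithSingleSegment(true))
	if err != nil {
		log.Fatal(err)
	}
	defer enc.Close()

	src := make([]byte, 4<<20) // 4 MiB of zeros, just as sample input
	dst := enc.EncodeAll(src, nil)
	fmt.Printf("compressed %d -> %d bytes (single-segment frame forced)\n", len(src), len(dst))
}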
func WithSingleSegment(b bool) EOption { return func(o *encoderOptions) error { diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go index fa0a633f3..9568a4ba3 100644 --- a/vendor/github.com/klauspost/compress/zstd/framedec.go +++ b/vendor/github.com/klauspost/compress/zstd/framedec.go @@ -106,7 +106,7 @@ func (d *frameDec) reset(br byteBuffer) error { } n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24) println("Skipping frame with", n, "bytes.") - err = br.skipN(int(n)) + err = br.skipN(int64(n)) if err != nil { if debugDecoder { println("Reading discarded frame", err) @@ -231,20 +231,27 @@ func (d *frameDec) reset(br byteBuffer) error { d.crc.Reset() } + if d.WindowSize > d.o.maxWindowSize { + if debugDecoder { + printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize) + } + return ErrWindowSizeExceeded + } + if d.WindowSize == 0 && d.SingleSegment { // We may not need window in this case. d.WindowSize = d.FrameContentSize if d.WindowSize < MinWindowSize { d.WindowSize = MinWindowSize } - } - - if d.WindowSize > uint64(d.o.maxWindowSize) { - if debugDecoder { - printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize) + if d.WindowSize > d.o.maxDecodedSize { + if debugDecoder { + printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize) + } + return ErrDecoderSizeExceeded } - return ErrWindowSizeExceeded } + // The minimum Window_Size is 1 KB. if d.WindowSize < MinWindowSize { if debugDecoder { diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go index e74df436c..c881d28d8 100644 --- a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go @@ -34,8 +34,8 @@ const ( // buildDtable will build the decoding table. func (s *fseDecoder) buildDtable() error { ctx := buildDtableAsmContext{ - stateTable: (*uint16)(&s.stateTable[0]), - norm: (*int16)(&s.norm[0]), + stateTable: &s.stateTable[0], + norm: &s.norm[0], dt: (*uint64)(&s.dt[0]), } code := buildDtable_asm(s, &ctx) diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go index 847b322ae..7598c1018 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go @@ -55,16 +55,22 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) { if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize { return false, nil } - useSafe := false - if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc { - useSafe = true - } - if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) { - useSafe = true - } - if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc { - useSafe = true - } + + // FIXME: Using unsafe memory copies leads to rare, random crashes + // with fuzz testing. It is therefore disabled for now. 
+ const useSafe = true + /* + useSafe := false + if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc { + useSafe = true + } + if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) { + useSafe = true + } + if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc { + useSafe = true + } + */ br := s.br diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s index 212c6cac3..27e76774c 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s @@ -52,34 +52,46 @@ sequenceDecs_decode_amd64_fill_byte_by_byte: sequenceDecs_decode_amd64_fill_end: // Update offset - MOVQ R9, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, 16(R10) + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_amd64_of_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_amd64_of_update_zero: + MOVQ AX, 16(R10) // Update match length - MOVQ R8, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, 8(R10) + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_amd64_ml_update_zero: + MOVQ AX, 8(R10) // Fill bitreader to have enough for the remaining CMPQ SI, $0x08 @@ -107,19 +119,25 @@ sequenceDecs_decode_amd64_fill_2_byte_by_byte: sequenceDecs_decode_amd64_fill_2_end: // Update literal length - MOVQ DI, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, (R10) + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_amd64_ll_update_zero: + MOVQ AX, (R10) // Fill bitreader for state updates MOVQ R14, (SP) @@ -134,18 +152,17 @@ sequenceDecs_decode_amd64_fill_2_end: MOVBQZX DI, R14 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R14, $0x00 - JZ sequenceDecs_decode_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, DI -sequenceDecs_decode_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -155,18 +172,17 @@ sequenceDecs_decode_amd64_llState_updateState_skip_zero: MOVBQZX R8, R14 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_amd64_mlState_updateState_skip_zero - 
MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R8 -sequenceDecs_decode_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -176,18 +192,17 @@ sequenceDecs_decode_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R14 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R9 -sequenceDecs_decode_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -201,7 +216,7 @@ sequenceDecs_decode_amd64_skip_update: MOVQ R12, R13 MOVQ R11, R12 MOVQ CX, R11 - JMP sequenceDecs_decode_amd64_adjust_end + JMP sequenceDecs_decode_amd64_after_adjust sequenceDecs_decode_amd64_adjust_offsetB_1_or_0: CMPQ (R10), $0x00000000 @@ -213,7 +228,7 @@ sequenceDecs_decode_amd64_adjust_offset_maybezero: TESTQ CX, CX JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero MOVQ R11, CX - JMP sequenceDecs_decode_amd64_adjust_end + JMP sequenceDecs_decode_amd64_after_adjust sequenceDecs_decode_amd64_adjust_offset_nonzero: CMPQ CX, $0x01 @@ -250,7 +265,7 @@ sequenceDecs_decode_amd64_adjust_temp_valid: MOVQ AX, R11 MOVQ AX, CX -sequenceDecs_decode_amd64_adjust_end: +sequenceDecs_decode_amd64_after_adjust: MOVQ CX, 16(R10) // Check values @@ -359,49 +374,67 @@ sequenceDecs_decode_56_amd64_fill_byte_by_byte: sequenceDecs_decode_56_amd64_fill_end: // Update offset - MOVQ R9, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, 16(R10) + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_56_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_56_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_56_amd64_of_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_56_amd64_of_update_zero: + MOVQ AX, 16(R10) // Update match length - MOVQ R8, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, 8(R10) + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_56_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_56_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_56_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_56_amd64_ml_update_zero: + MOVQ AX, 8(R10) // Update literal length - MOVQ DI, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R15 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R15 - ADDQ R15, AX - MOVQ AX, (R10) + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_56_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_56_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_56_amd64_ll_update_zero + NEGQ CX + 
SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_56_amd64_ll_update_zero: + MOVQ AX, (R10) // Fill bitreader for state updates MOVQ R14, (SP) @@ -416,18 +449,17 @@ sequenceDecs_decode_56_amd64_fill_end: MOVBQZX DI, R14 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R14, $0x00 - JZ sequenceDecs_decode_56_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, DI -sequenceDecs_decode_56_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -437,18 +469,17 @@ sequenceDecs_decode_56_amd64_llState_updateState_skip_zero: MOVBQZX R8, R14 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R8 -sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -458,18 +489,17 @@ sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R14 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R14, $0x00 - JZ sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R14, BX + LEAQ (BX)(R14*1), CX MOVQ DX, R15 - SHLQ CL, R15 - MOVQ R14, CX - NEGQ CX - SHRQ CL, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 ADDQ R15, R9 -sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -483,7 +513,7 @@ sequenceDecs_decode_56_amd64_skip_update: MOVQ R12, R13 MOVQ R11, R12 MOVQ CX, R11 - JMP sequenceDecs_decode_56_amd64_adjust_end + JMP sequenceDecs_decode_56_amd64_after_adjust sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0: CMPQ (R10), $0x00000000 @@ -495,7 +525,7 @@ sequenceDecs_decode_56_amd64_adjust_offset_maybezero: TESTQ CX, CX JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero MOVQ R11, CX - JMP sequenceDecs_decode_56_amd64_adjust_end + JMP sequenceDecs_decode_56_amd64_after_adjust sequenceDecs_decode_56_amd64_adjust_offset_nonzero: CMPQ CX, $0x01 @@ -532,7 +562,7 @@ sequenceDecs_decode_56_amd64_adjust_temp_valid: MOVQ AX, R11 MOVQ AX, CX -sequenceDecs_decode_56_amd64_adjust_end: +sequenceDecs_decode_56_amd64_after_adjust: MOVQ CX, 16(R10) // Check values @@ -763,7 +793,7 @@ sequenceDecs_decode_bmi2_skip_update: MOVQ R11, R12 MOVQ R10, R11 MOVQ CX, R10 - JMP sequenceDecs_decode_bmi2_adjust_end + JMP sequenceDecs_decode_bmi2_after_adjust sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0: CMPQ (R9), $0x00000000 @@ -775,7 +805,7 @@ sequenceDecs_decode_bmi2_adjust_offset_maybezero: TESTQ CX, CX JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero MOVQ R10, CX - JMP sequenceDecs_decode_bmi2_adjust_end + JMP sequenceDecs_decode_bmi2_after_adjust sequenceDecs_decode_bmi2_adjust_offset_nonzero: CMPQ CX, $0x01 @@ -812,7 +842,7 @@ sequenceDecs_decode_bmi2_adjust_temp_valid: MOVQ R13, R10 MOVQ R13, CX -sequenceDecs_decode_bmi2_adjust_end: +sequenceDecs_decode_bmi2_after_adjust: MOVQ CX, 16(R9) // Check values @@ -1018,7 +1048,7 @@ sequenceDecs_decode_56_bmi2_skip_update: MOVQ R11, R12 MOVQ R10, R11 MOVQ CX, R10 - JMP sequenceDecs_decode_56_bmi2_adjust_end + JMP 
sequenceDecs_decode_56_bmi2_after_adjust sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0: CMPQ (R9), $0x00000000 @@ -1030,7 +1060,7 @@ sequenceDecs_decode_56_bmi2_adjust_offset_maybezero: TESTQ CX, CX JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero MOVQ R10, CX - JMP sequenceDecs_decode_56_bmi2_adjust_end + JMP sequenceDecs_decode_56_bmi2_after_adjust sequenceDecs_decode_56_bmi2_adjust_offset_nonzero: CMPQ CX, $0x01 @@ -1067,7 +1097,7 @@ sequenceDecs_decode_56_bmi2_adjust_temp_valid: MOVQ R13, R10 MOVQ R13, CX -sequenceDecs_decode_56_bmi2_adjust_end: +sequenceDecs_decode_56_bmi2_after_adjust: MOVQ CX, 16(R9) // Check values @@ -1181,52 +1211,65 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, R11 - SUBQ DI, R11 - JLS copy_match - MOVQ R9, R14 - SUBQ R11, R14 - CMPQ R13, R11 - JGE copy_all_from_history - XORQ R11, R11 - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(R11*1), R12 - MOVB R12, (BX)(R11*1) - ADDQ $0x01, R11 - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(R11*1), R12 - MOVW R12, (BX)(R11*1) - ADDQ $0x02, R11 - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(R11*1), R12 - MOVL R12, (BX)(R11*1) - ADDQ $0x04, R11 - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(R11*1), R12 - MOVQ R12, (BX)(R11*1) - ADDQ $0x08, R11 - JMP copy_4_test - -copy_4: - MOVUPS (R14)(R11*1), X0 - MOVUPS X0, (BX)(R11*1) - ADDQ $0x10, R11 + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JG copy_all_from_history + MOVQ R13, R11 + SUBQ $0x10, R11 + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R11 + JAE copy_4_loop + LEAQ 16(R14)(R11*1), R14 + LEAQ 16(BX)(R11*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), R11 + MOVB 2(R14), R12 + MOVW R11, (BX) + MOVB R12, 2(BX) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), R11 + MOVL -4(R14)(R13*1), R12 + MOVL R11, (BX) + MOVL R12, -4(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), R11 + MOVQ -8(R14)(R13*1), R12 + MOVQ R11, (BX) + MOVQ R12, -8(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX -copy_4_test: - CMPQ R11, R13 - JB copy_4 +copy_4_end: ADDQ R13, DI - ADDQ R13, BX ADDQ $0x18, AX INCQ DX CMPQ DX, CX @@ -1234,53 +1277,74 @@ copy_4_test: JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, R11 - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (BX)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, R11 - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (BX)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, R11 - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (BX)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, R11 - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (BX)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (BX)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, R11 - JB copy_5 + MOVQ R11, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(BX)(R15*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP 
copy_5_end + +copy_5_small: + CMPQ R11, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ R11, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(R11*1), BP + MOVB R15, (BX) + MOVB BP, -1(BX)(R11*1) + ADDQ R11, R14 ADDQ R11, BX + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (BX) + MOVB BP, 2(BX) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(R11*1), BP + MOVL R15, (BX) + MOVL BP, -4(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(R11*1), BP + MOVQ R15, (BX) + MOVQ BP, -8(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + +copy_5_end: ADDQ R11, DI SUBQ R11, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ BX, R11 - SUBQ R12, R11 + MOVQ BX, R11 + SUBQ R12, R11 // ml <= mo CMPQ R13, R12 @@ -1382,45 +1446,67 @@ main_loop: // Copy literals TESTQ R11, R11 JZ check_offset - XORQ R14, R14 - TESTQ $0x00000001, R11 - JZ copy_1_word - MOVB (SI)(R14*1), R15 - MOVB R15, (BX)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, R11 - JZ copy_1_dword - MOVW (SI)(R14*1), R15 - MOVW R15, (BX)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, R11 - JZ copy_1_qword - MOVL (SI)(R14*1), R15 - MOVL R15, (BX)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, R11 - JZ copy_1_test - MOVQ (SI)(R14*1), R15 - MOVQ R15, (BX)(R14*1) - ADDQ $0x08, R14 - JMP copy_1_test + MOVQ R11, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (SI), X0 + MOVUPS X0, (BX) + ADDQ $0x10, SI + ADDQ $0x10, BX + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(SI)(R14*1), SI + LEAQ 16(BX)(R14*1), BX + MOVUPS -16(SI), X0 + MOVUPS X0, -16(BX) + JMP copy_1_end + +copy_1_small: + CMPQ R11, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ R11, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (SI), R14 + MOVB -1(SI)(R11*1), R15 + MOVB R14, (BX) + MOVB R15, -1(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end -copy_1: - MOVUPS (SI)(R14*1), X0 - MOVUPS X0, (BX)(R14*1) - ADDQ $0x10, R14 +copy_1_move_3: + MOVW (SI), R14 + MOVB 2(SI), R15 + MOVW R14, (BX) + MOVB R15, 2(BX) + ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end -copy_1_test: - CMPQ R14, R11 - JB copy_1 +copy_1_move_4through7: + MOVL (SI), R14 + MOVL -4(SI)(R11*1), R15 + MOVL R14, (BX) + MOVL R15, -4(BX)(R11*1) ADDQ R11, SI ADDQ R11, BX + JMP copy_1_end + +copy_1_move_8through16: + MOVQ (SI), R14 + MOVQ -8(SI)(R11*1), R15 + MOVQ R14, (BX) + MOVQ R15, -8(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX + +copy_1_end: ADDQ R11, DI // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -1432,52 +1518,65 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, R11 - SUBQ DI, R11 - JLS copy_match - MOVQ R9, R14 - SUBQ R11, R14 - CMPQ R13, R11 - JGE copy_all_from_history - XORQ R11, R11 - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(R11*1), R12 - MOVB R12, (BX)(R11*1) - ADDQ $0x01, R11 - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(R11*1), R12 - MOVW R12, (BX)(R11*1) - ADDQ $0x02, R11 - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(R11*1), R12 - MOVL R12, (BX)(R11*1) - ADDQ $0x04, R11 - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(R11*1), R12 - MOVQ R12, (BX)(R11*1) - ADDQ $0x08, R11 - JMP copy_4_test - -copy_4: - 
MOVUPS (R14)(R11*1), X0 - MOVUPS X0, (BX)(R11*1) - ADDQ $0x10, R11 + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JG copy_all_from_history + MOVQ R13, R11 + SUBQ $0x10, R11 + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R11 + JAE copy_4_loop + LEAQ 16(R14)(R11*1), R14 + LEAQ 16(BX)(R11*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), R11 + MOVB 2(R14), R12 + MOVW R11, (BX) + MOVB R12, 2(BX) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), R11 + MOVL -4(R14)(R13*1), R12 + MOVL R11, (BX) + MOVL R12, -4(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), R11 + MOVQ -8(R14)(R13*1), R12 + MOVQ R11, (BX) + MOVQ R12, -8(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX -copy_4_test: - CMPQ R11, R13 - JB copy_4 +copy_4_end: ADDQ R13, DI - ADDQ R13, BX ADDQ $0x18, AX INCQ DX CMPQ DX, CX @@ -1485,99 +1584,143 @@ copy_4_test: JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, R11 - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (BX)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, R11 - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (BX)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, R11 - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (BX)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, R11 - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (BX)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (BX)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, R11 - JB copy_5 + MOVQ R11, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(BX)(R15*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_5_end + +copy_5_small: + CMPQ R11, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ R11, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(R11*1), BP + MOVB R15, (BX) + MOVB BP, -1(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (BX) + MOVB BP, 2(BX) + ADDQ R11, R14 ADDQ R11, BX + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(R11*1), BP + MOVL R15, (BX) + MOVL BP, -4(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(R11*1), BP + MOVQ R15, (BX) + MOVQ BP, -8(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + +copy_5_end: ADDQ R11, DI SUBQ R11, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ BX, R11 - SUBQ R12, R11 + MOVQ BX, R11 + SUBQ R12, R11 // ml <= mo CMPQ R13, R12 JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, DI - XORQ R12, R12 - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (R11)(R12*1), R14 - MOVB R14, (BX)(R12*1) - ADDQ $0x01, R12 - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (R11)(R12*1), R14 - MOVW R14, (BX)(R12*1) - ADDQ $0x02, R12 - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (R11)(R12*1), R14 - MOVL R14, 
(BX)(R12*1) - ADDQ $0x04, R12 - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_test - MOVQ (R11)(R12*1), R14 - MOVQ R14, (BX)(R12*1) - ADDQ $0x08, R12 - JMP copy_2_test + ADDQ R13, DI + MOVQ R13, R12 + SUBQ $0x10, R12 + JB copy_2_small -copy_2: - MOVUPS (R11)(R12*1), X0 - MOVUPS X0, (BX)(R12*1) - ADDQ $0x10, R12 +copy_2_loop: + MOVUPS (R11), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R11 + ADDQ $0x10, BX + SUBQ $0x10, R12 + JAE copy_2_loop + LEAQ 16(R11)(R12*1), R11 + LEAQ 16(BX)(R12*1), BX + MOVUPS -16(R11), X0 + MOVUPS X0, -16(BX) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (R11), R12 + MOVB -1(R11)(R13*1), R14 + MOVB R12, (BX) + MOVB R14, -1(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end -copy_2_test: - CMPQ R12, R13 - JB copy_2 +copy_2_move_3: + MOVW (R11), R12 + MOVB 2(R11), R14 + MOVW R12, (BX) + MOVB R14, 2(BX) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_4through7: + MOVL (R11), R12 + MOVL -4(R11)(R13*1), R14 + MOVL R12, (BX) + MOVL R14, -4(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (R11), R12 + MOVQ -8(R11)(R13*1), R14 + MOVQ R12, (BX) + MOVQ R14, -8(BX)(R13*1) + ADDQ R13, R11 ADDQ R13, BX - JMP handle_loop + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: @@ -1642,6 +1785,10 @@ TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32 MOVQ 72(AX), DI MOVQ 80(AX), R8 MOVQ 88(AX), R9 + XORQ CX, CX + MOVQ CX, 8(SP) + MOVQ CX, 16(SP) + MOVQ CX, 24(SP) MOVQ 112(AX), R10 MOVQ 128(AX), CX MOVQ CX, 32(SP) @@ -1691,34 +1838,46 @@ sequenceDecs_decodeSync_amd64_fill_byte_by_byte: sequenceDecs_decodeSync_amd64_fill_end: // Update offset - MOVQ R9, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 8(SP) + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_amd64_of_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_amd64_of_update_zero: + MOVQ AX, 8(SP) // Update match length - MOVQ R8, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 16(SP) + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_amd64_ml_update_zero: + MOVQ AX, 16(SP) // Fill bitreader to have enough for the remaining CMPQ SI, $0x08 @@ -1746,19 +1905,25 @@ sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte: sequenceDecs_decodeSync_amd64_fill_2_end: // Update literal length - MOVQ DI, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 24(SP) + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ 
sequenceDecs_decodeSync_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_amd64_ll_update_zero: + MOVQ AX, 24(SP) // Fill bitreader for state updates MOVQ R13, (SP) @@ -1773,18 +1938,17 @@ sequenceDecs_decodeSync_amd64_fill_2_end: MOVBQZX DI, R13 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, DI -sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -1794,18 +1958,17 @@ sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero: MOVBQZX R8, R13 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R8 -sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -1815,18 +1978,17 @@ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R13 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R9 -sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -1841,7 +2003,7 @@ sequenceDecs_decodeSync_amd64_skip_update: MOVUPS 144(CX), X0 MOVQ R13, 144(CX) MOVUPS X0, 152(CX) - JMP sequenceDecs_decodeSync_amd64_adjust_end + JMP sequenceDecs_decodeSync_amd64_after_adjust sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0: CMPQ 24(SP), $0x00000000 @@ -1853,7 +2015,7 @@ sequenceDecs_decodeSync_amd64_adjust_offset_maybezero: TESTQ R13, R13 JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero MOVQ 144(CX), R13 - JMP sequenceDecs_decodeSync_amd64_adjust_end + JMP sequenceDecs_decodeSync_amd64_after_adjust sequenceDecs_decodeSync_amd64_adjust_offset_nonzero: MOVQ R13, AX @@ -1862,8 +2024,7 @@ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero: CMPQ R13, $0x03 CMOVQEQ R14, AX CMOVQEQ R15, R14 - LEAQ 144(CX), R15 - ADDQ (R15)(AX*8), R14 + ADDQ 144(CX)(AX*8), R14 JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid MOVQ $0x00000001, R14 @@ -1879,7 +2040,7 @@ sequenceDecs_decodeSync_amd64_adjust_skip: MOVQ R14, 144(CX) MOVQ R14, R13 -sequenceDecs_decodeSync_amd64_adjust_end: +sequenceDecs_decodeSync_amd64_after_adjust: MOVQ R13, 8(SP) // Check values @@ -1934,103 +2095,137 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ CX, AX - SUBQ R12, AX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ AX, R14 - CMPQ R13, AX - JGE copy_all_from_history - XORQ AX, AX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(AX*1), CL - MOVB CL, (R10)(AX*1) - ADDQ $0x01, AX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ 
copy_4_dword - MOVW (R14)(AX*1), CX - MOVW CX, (R10)(AX*1) - ADDQ $0x02, AX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(AX*1), CX - MOVL CX, (R10)(AX*1) - ADDQ $0x04, AX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(AX*1), CX - MOVQ CX, (R10)(AX*1) - ADDQ $0x08, AX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(AX*1), X0 - MOVUPS X0, (R10)(AX*1) - ADDQ $0x10, AX + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JG copy_all_from_history + MOVQ R13, AX + SUBQ $0x10, AX + JB copy_4_small -copy_4_test: - CMPQ AX, R13 - JB copy_4 - ADDQ R13, R12 +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, AX + JAE copy_4_loop + LEAQ 16(R14)(AX*1), R14 + LEAQ 16(R10)(AX*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), AX + MOVB 2(R14), CL + MOVW AX, (R10) + MOVB CL, 2(R10) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), AX + MOVL -4(R14)(R13*1), CX + MOVL AX, (R10) + MOVL CX, -4(R10)(R13*1) + ADDQ R13, R14 ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), AX + MOVQ -8(R14)(R13*1), CX + MOVQ AX, (R10) + MOVQ CX, -8(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + +copy_4_end: + ADDQ R13, R12 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, AX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R10)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, AX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R10)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, AX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R10)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, AX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R10)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R10)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, AX - JB copy_5 + MOVQ AX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R10)(R15*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_5_end + +copy_5_small: + CMPQ AX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ AX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(AX*1), BP + MOVB R15, (R10) + MOVB BP, -1(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R10) + MOVB BP, 2(R10) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(AX*1), BP + MOVL R15, (R10) + MOVL BP, -4(R10)(AX*1) + ADDQ AX, R14 ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(AX*1), BP + MOVQ R15, (R10) + MOVQ BP, -8(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + +copy_5_end: ADDQ AX, R12 SUBQ AX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R10, AX - SUBQ CX, AX + MOVQ R10, AX + SUBQ CX, AX // ml <= mo CMPQ R13, CX @@ -2142,6 +2337,10 @@ TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32 MOVQ 72(CX), SI MOVQ 80(CX), DI MOVQ 88(CX), R8 + XORQ R9, R9 + MOVQ 
R9, 8(SP) + MOVQ R9, 16(SP) + MOVQ R9, 24(SP) MOVQ 112(CX), R9 MOVQ 128(CX), R10 MOVQ R10, 32(SP) @@ -2314,7 +2513,7 @@ sequenceDecs_decodeSync_bmi2_skip_update: MOVUPS 144(CX), X0 MOVQ R13, 144(CX) MOVUPS X0, 152(CX) - JMP sequenceDecs_decodeSync_bmi2_adjust_end + JMP sequenceDecs_decodeSync_bmi2_after_adjust sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0: CMPQ 24(SP), $0x00000000 @@ -2326,7 +2525,7 @@ sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero: TESTQ R13, R13 JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero MOVQ 144(CX), R13 - JMP sequenceDecs_decodeSync_bmi2_adjust_end + JMP sequenceDecs_decodeSync_bmi2_after_adjust sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero: MOVQ R13, R12 @@ -2335,8 +2534,7 @@ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero: CMPQ R13, $0x03 CMOVQEQ R14, R12 CMOVQEQ R15, R14 - LEAQ 144(CX), R15 - ADDQ (R15)(R12*8), R14 + ADDQ 144(CX)(R12*8), R14 JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid MOVQ $0x00000001, R14 @@ -2352,7 +2550,7 @@ sequenceDecs_decodeSync_bmi2_adjust_skip: MOVQ R14, 144(CX) MOVQ R14, R13 -sequenceDecs_decodeSync_bmi2_adjust_end: +sequenceDecs_decodeSync_bmi2_after_adjust: MOVQ R13, 8(SP) // Check values @@ -2407,103 +2605,137 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, CX - SUBQ R11, CX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ CX, R14 - CMPQ R13, CX - JGE copy_all_from_history - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(CX*1), R12 - MOVB R12, (R9)(CX*1) - ADDQ $0x01, CX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(CX*1), R12 - MOVW R12, (R9)(CX*1) - ADDQ $0x02, CX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(CX*1), R12 - MOVL R12, (R9)(CX*1) - ADDQ $0x04, CX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(CX*1), R12 - MOVQ R12, (R9)(CX*1) - ADDQ $0x08, CX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(CX*1), X0 - MOVUPS X0, (R9)(CX*1) - ADDQ $0x10, CX + MOVQ R12, CX + SUBQ R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JG copy_all_from_history + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, CX + JAE copy_4_loop + LEAQ 16(R14)(CX*1), R14 + LEAQ 16(R9)(CX*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), CX + MOVB 2(R14), R12 + MOVW CX, (R9) + MOVB R12, 2(R9) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), CX + MOVL -4(R14)(R13*1), R12 + MOVL CX, (R9) + MOVL R12, -4(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), CX + MOVQ -8(R14)(R13*1), R12 + MOVQ CX, (R9) + MOVQ R12, -8(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 -copy_4_test: - CMPQ CX, R13 - JB copy_4 +copy_4_end: ADDQ R13, R11 - ADDQ R13, R9 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, CX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R9)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, CX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R9)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, CX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R9)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, CX - JZ copy_5_test - MOVQ 
(R14)(R15*1), BP - MOVQ BP, (R9)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R9)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, CX - JB copy_5 + MOVQ CX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R9)(R15*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_5_end + +copy_5_small: + CMPQ CX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ CX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(CX*1), BP + MOVB R15, (R9) + MOVB BP, -1(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R9) + MOVB BP, 2(R9) + ADDQ CX, R14 ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(CX*1), BP + MOVL R15, (R9) + MOVL BP, -4(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(CX*1), BP + MOVQ R15, (R9) + MOVQ BP, -8(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + +copy_5_end: ADDQ CX, R11 SUBQ CX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R9, CX - SUBQ R12, CX + MOVQ R9, CX + SUBQ R12, CX // ml <= mo CMPQ R13, R12 @@ -2615,6 +2847,10 @@ TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32 MOVQ 72(AX), DI MOVQ 80(AX), R8 MOVQ 88(AX), R9 + XORQ CX, CX + MOVQ CX, 8(SP) + MOVQ CX, 16(SP) + MOVQ CX, 24(SP) MOVQ 112(AX), R10 MOVQ 128(AX), CX MOVQ CX, 32(SP) @@ -2664,34 +2900,46 @@ sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte: sequenceDecs_decodeSync_safe_amd64_fill_end: // Update offset - MOVQ R9, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 8(SP) + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_safe_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_safe_amd64_of_update_zero: + MOVQ AX, 8(SP) // Update match length - MOVQ R8, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 16(SP) + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_safe_amd64_ml_update_zero: + MOVQ AX, 16(SP) // Fill bitreader to have enough for the remaining CMPQ SI, $0x08 @@ -2719,19 +2967,25 @@ sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte: sequenceDecs_decodeSync_safe_amd64_fill_2_end: // Update literal length - MOVQ DI, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - ADDQ CX, BX - NEGL CX - SHRQ CL, R14 - SHRQ $0x20, AX - TESTQ CX, CX - CMOVQEQ CX, R14 - ADDQ R14, AX - MOVQ AX, 24(SP) + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, 
CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_safe_amd64_ll_update_zero: + MOVQ AX, 24(SP) // Fill bitreader for state updates MOVQ R13, (SP) @@ -2746,18 +3000,17 @@ sequenceDecs_decodeSync_safe_amd64_fill_2_end: MOVBQZX DI, R13 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, DI -sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX @@ -2767,18 +3020,17 @@ sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero: MOVBQZX R8, R13 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R8 -sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX @@ -2788,18 +3040,17 @@ sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero: MOVBQZX R9, R13 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R13, $0x00 - JZ sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero - MOVQ BX, CX - ADDQ R13, BX + LEAQ (BX)(R13*1), CX MOVQ DX, R14 - SHLQ CL, R14 - MOVQ R13, CX - NEGQ CX - SHRQ CL, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 ADDQ R14, R9 -sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX @@ -2814,7 +3065,7 @@ sequenceDecs_decodeSync_safe_amd64_skip_update: MOVUPS 144(CX), X0 MOVQ R13, 144(CX) MOVUPS X0, 152(CX) - JMP sequenceDecs_decodeSync_safe_amd64_adjust_end + JMP sequenceDecs_decodeSync_safe_amd64_after_adjust sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0: CMPQ 24(SP), $0x00000000 @@ -2826,7 +3077,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero: TESTQ R13, R13 JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero MOVQ 144(CX), R13 - JMP sequenceDecs_decodeSync_safe_amd64_adjust_end + JMP sequenceDecs_decodeSync_safe_amd64_after_adjust sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero: MOVQ R13, AX @@ -2835,8 +3086,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero: CMPQ R13, $0x03 CMOVQEQ R14, AX CMOVQEQ R15, R14 - LEAQ 144(CX), R15 - ADDQ (R15)(AX*8), R14 + ADDQ 144(CX)(AX*8), R14 JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid MOVQ $0x00000001, R14 @@ -2852,7 +3102,7 @@ sequenceDecs_decodeSync_safe_amd64_adjust_skip: MOVQ R14, 144(CX) MOVQ R14, R13 -sequenceDecs_decodeSync_safe_amd64_adjust_end: +sequenceDecs_decodeSync_safe_amd64_after_adjust: MOVQ R13, 8(SP) // Check values @@ -2885,45 +3135,67 @@ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok: // Copy literals TESTQ AX, AX JZ check_offset - XORQ R14, R14 - TESTQ $0x00000001, AX - JZ copy_1_word - MOVB (R11)(R14*1), R15 - MOVB 
R15, (R10)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, AX - JZ copy_1_dword - MOVW (R11)(R14*1), R15 - MOVW R15, (R10)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, AX - JZ copy_1_qword - MOVL (R11)(R14*1), R15 - MOVL R15, (R10)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, AX - JZ copy_1_test - MOVQ (R11)(R14*1), R15 - MOVQ R15, (R10)(R14*1) - ADDQ $0x08, R14 - JMP copy_1_test + MOVQ AX, R14 + SUBQ $0x10, R14 + JB copy_1_small -copy_1: - MOVUPS (R11)(R14*1), X0 - MOVUPS X0, (R10)(R14*1) - ADDQ $0x10, R14 +copy_1_loop: + MOVUPS (R11), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R11 + ADDQ $0x10, R10 + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(R11)(R14*1), R11 + LEAQ 16(R10)(R14*1), R10 + MOVUPS -16(R11), X0 + MOVUPS X0, -16(R10) + JMP copy_1_end + +copy_1_small: + CMPQ AX, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ AX, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (R11), R14 + MOVB -1(R11)(AX*1), R15 + MOVB R14, (R10) + MOVB R15, -1(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_3: + MOVW (R11), R14 + MOVB 2(R11), R15 + MOVW R14, (R10) + MOVB R15, 2(R10) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_4through7: + MOVL (R11), R14 + MOVL -4(R11)(AX*1), R15 + MOVL R14, (R10) + MOVL R15, -4(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end -copy_1_test: - CMPQ R14, AX - JB copy_1 +copy_1_move_8through16: + MOVQ (R11), R14 + MOVQ -8(R11)(AX*1), R15 + MOVQ R14, (R10) + MOVQ R15, -8(R10)(AX*1) ADDQ AX, R11 ADDQ AX, R10 + +copy_1_end: ADDQ AX, R12 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -2936,149 +3208,206 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ CX, AX - SUBQ R12, AX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ AX, R14 - CMPQ R13, AX - JGE copy_all_from_history - XORQ AX, AX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(AX*1), CL - MOVB CL, (R10)(AX*1) - ADDQ $0x01, AX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(AX*1), CX - MOVW CX, (R10)(AX*1) - ADDQ $0x02, AX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(AX*1), CX - MOVL CX, (R10)(AX*1) - ADDQ $0x04, AX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(AX*1), CX - MOVQ CX, (R10)(AX*1) - ADDQ $0x08, AX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(AX*1), X0 - MOVUPS X0, (R10)(AX*1) - ADDQ $0x10, AX + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JG copy_all_from_history + MOVQ R13, AX + SUBQ $0x10, AX + JB copy_4_small -copy_4_test: - CMPQ AX, R13 - JB copy_4 - ADDQ R13, R12 +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, AX + JAE copy_4_loop + LEAQ 16(R14)(AX*1), R14 + LEAQ 16(R10)(AX*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), AX + MOVB 2(R14), CL + MOVW AX, (R10) + MOVB CL, 2(R10) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), AX + MOVL -4(R14)(R13*1), CX + MOVL AX, (R10) + MOVL CX, -4(R10)(R13*1) + ADDQ R13, R14 ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), AX + MOVQ -8(R14)(R13*1), CX + MOVQ AX, (R10) + MOVQ CX, -8(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + +copy_4_end: + ADDQ R13, R12 JMP 
handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, AX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R10)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, AX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R10)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, AX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R10)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, AX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R10)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R10)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, AX - JB copy_5 + MOVQ AX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R10)(R15*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_5_end + +copy_5_small: + CMPQ AX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ AX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(AX*1), BP + MOVB R15, (R10) + MOVB BP, -1(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R10) + MOVB BP, 2(R10) + ADDQ AX, R14 ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(AX*1), BP + MOVL R15, (R10) + MOVL BP, -4(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(AX*1), BP + MOVQ R15, (R10) + MOVQ BP, -8(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + +copy_5_end: ADDQ AX, R12 SUBQ AX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R10, AX - SUBQ CX, AX + MOVQ R10, AX + SUBQ CX, AX // ml <= mo CMPQ R13, CX JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, R12 - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (AX)(CX*1), R14 - MOVB R14, (R10)(CX*1) - ADDQ $0x01, CX - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (AX)(CX*1), R14 - MOVW R14, (R10)(CX*1) - ADDQ $0x02, CX - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (AX)(CX*1), R14 - MOVL R14, (R10)(CX*1) - ADDQ $0x04, CX - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_test - MOVQ (AX)(CX*1), R14 - MOVQ R14, (R10)(CX*1) - ADDQ $0x08, CX - JMP copy_2_test - -copy_2: - MOVUPS (AX)(CX*1), X0 - MOVUPS X0, (R10)(CX*1) - ADDQ $0x10, CX + ADDQ R13, R12 + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_2_small -copy_2_test: - CMPQ CX, R13 - JB copy_2 +copy_2_loop: + MOVUPS (AX), X0 + MOVUPS X0, (R10) + ADDQ $0x10, AX + ADDQ $0x10, R10 + SUBQ $0x10, CX + JAE copy_2_loop + LEAQ 16(AX)(CX*1), AX + LEAQ 16(R10)(CX*1), R10 + MOVUPS -16(AX), X0 + MOVUPS X0, -16(R10) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (AX), CL + MOVB -1(AX)(R13*1), R14 + MOVB CL, (R10) + MOVB R14, -1(R10)(R13*1) + ADDQ R13, AX ADDQ R13, R10 - JMP handle_loop + JMP copy_2_end + +copy_2_move_3: + MOVW (AX), CX + MOVB 2(AX), R14 + MOVW CX, (R10) + MOVB R14, 2(R10) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_4through7: + MOVL (AX), CX + MOVL -4(AX)(R13*1), R14 + MOVL CX, (R10) + MOVL R14, -4(R10)(R13*1) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + 
+copy_2_move_8through16: + MOVQ (AX), CX + MOVQ -8(AX)(R13*1), R14 + MOVQ CX, (R10) + MOVQ R14, -8(R10)(R13*1) + ADDQ R13, AX + ADDQ R13, R10 + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: @@ -3172,6 +3501,10 @@ TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32 MOVQ 72(CX), SI MOVQ 80(CX), DI MOVQ 88(CX), R8 + XORQ R9, R9 + MOVQ R9, 8(SP) + MOVQ R9, 16(SP) + MOVQ R9, 24(SP) MOVQ 112(CX), R9 MOVQ 128(CX), R10 MOVQ R10, 32(SP) @@ -3344,7 +3677,7 @@ sequenceDecs_decodeSync_safe_bmi2_skip_update: MOVUPS 144(CX), X0 MOVQ R13, 144(CX) MOVUPS X0, 152(CX) - JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end + JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0: CMPQ 24(SP), $0x00000000 @@ -3356,7 +3689,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero: TESTQ R13, R13 JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero MOVQ 144(CX), R13 - JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end + JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero: MOVQ R13, R12 @@ -3365,8 +3698,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero: CMPQ R13, $0x03 CMOVQEQ R14, R12 CMOVQEQ R15, R14 - LEAQ 144(CX), R15 - ADDQ (R15)(R12*8), R14 + ADDQ 144(CX)(R12*8), R14 JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid MOVQ $0x00000001, R14 @@ -3382,7 +3714,7 @@ sequenceDecs_decodeSync_safe_bmi2_adjust_skip: MOVQ R14, 144(CX) MOVQ R14, R13 -sequenceDecs_decodeSync_safe_bmi2_adjust_end: +sequenceDecs_decodeSync_safe_bmi2_after_adjust: MOVQ R13, 8(SP) // Check values @@ -3415,45 +3747,67 @@ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok: // Copy literals TESTQ CX, CX JZ check_offset - XORQ R14, R14 - TESTQ $0x00000001, CX - JZ copy_1_word - MOVB (R10)(R14*1), R15 - MOVB R15, (R9)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, CX - JZ copy_1_dword - MOVW (R10)(R14*1), R15 - MOVW R15, (R9)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, CX - JZ copy_1_qword - MOVL (R10)(R14*1), R15 - MOVL R15, (R9)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, CX - JZ copy_1_test - MOVQ (R10)(R14*1), R15 - MOVQ R15, (R9)(R14*1) - ADDQ $0x08, R14 - JMP copy_1_test + MOVQ CX, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (R10), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R10 + ADDQ $0x10, R9 + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(R10)(R14*1), R10 + LEAQ 16(R9)(R14*1), R9 + MOVUPS -16(R10), X0 + MOVUPS X0, -16(R9) + JMP copy_1_end + +copy_1_small: + CMPQ CX, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ CX, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (R10), R14 + MOVB -1(R10)(CX*1), R15 + MOVB R14, (R9) + MOVB R15, -1(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end -copy_1: - MOVUPS (R10)(R14*1), X0 - MOVUPS X0, (R9)(R14*1) - ADDQ $0x10, R14 +copy_1_move_3: + MOVW (R10), R14 + MOVB 2(R10), R15 + MOVW R14, (R9) + MOVB R15, 2(R9) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end + +copy_1_move_4through7: + MOVL (R10), R14 + MOVL -4(R10)(CX*1), R15 + MOVL R14, (R9) + MOVL R15, -4(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end -copy_1_test: - CMPQ R14, CX - JB copy_1 +copy_1_move_8through16: + MOVQ (R10), R14 + MOVQ -8(R10)(CX*1), R15 + MOVQ R14, (R9) + MOVQ R15, -8(R9)(CX*1) ADDQ CX, R10 ADDQ CX, R9 + +copy_1_end: ADDQ CX, R11 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -3466,149 +3820,206 @@ check_offset: JG 
error_match_off_too_big // Copy match from history - MOVQ R12, CX - SUBQ R11, CX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ CX, R14 - CMPQ R13, CX - JGE copy_all_from_history - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(CX*1), R12 - MOVB R12, (R9)(CX*1) - ADDQ $0x01, CX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(CX*1), R12 - MOVW R12, (R9)(CX*1) - ADDQ $0x02, CX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(CX*1), R12 - MOVL R12, (R9)(CX*1) - ADDQ $0x04, CX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_test - MOVQ (R14)(CX*1), R12 - MOVQ R12, (R9)(CX*1) - ADDQ $0x08, CX - JMP copy_4_test - -copy_4: - MOVUPS (R14)(CX*1), X0 - MOVUPS X0, (R9)(CX*1) - ADDQ $0x10, CX + MOVQ R12, CX + SUBQ R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JG copy_all_from_history + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, CX + JAE copy_4_loop + LEAQ 16(R14)(CX*1), R14 + LEAQ 16(R9)(CX*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), CX + MOVB 2(R14), R12 + MOVW CX, (R9) + MOVB R12, 2(R9) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), CX + MOVL -4(R14)(R13*1), R12 + MOVL CX, (R9) + MOVL R12, -4(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), CX + MOVQ -8(R14)(R13*1), R12 + MOVQ CX, (R9) + MOVQ R12, -8(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 -copy_4_test: - CMPQ CX, R13 - JB copy_4 +copy_4_end: ADDQ R13, R11 - ADDQ R13, R9 JMP handle_loop JMP loop_finished copy_all_from_history: - XORQ R15, R15 - TESTQ $0x00000001, CX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R9)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, CX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R9)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, CX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R9)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, CX - JZ copy_5_test - MOVQ (R14)(R15*1), BP - MOVQ BP, (R9)(R15*1) - ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R9)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, CX - JB copy_5 + MOVQ CX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R9)(R15*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_5_end + +copy_5_small: + CMPQ CX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ CX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(CX*1), BP + MOVB R15, (R9) + MOVB BP, -1(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R9) + MOVB BP, 2(R9) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(CX*1), BP + MOVL R15, (R9) + MOVL BP, -4(R9)(CX*1) + ADDQ CX, R14 ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(CX*1), BP + MOVQ R15, (R9) + MOVQ BP, -8(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + +copy_5_end: ADDQ CX, R11 SUBQ 
CX, R13 // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R9, CX - SUBQ R12, CX + MOVQ R9, CX + SUBQ R12, CX // ml <= mo CMPQ R13, R12 JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, R11 - XORQ R12, R12 - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (CX)(R12*1), R14 - MOVB R14, (R9)(R12*1) - ADDQ $0x01, R12 - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (CX)(R12*1), R14 - MOVW R14, (R9)(R12*1) - ADDQ $0x02, R12 - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (CX)(R12*1), R14 - MOVL R14, (R9)(R12*1) - ADDQ $0x04, R12 - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_test - MOVQ (CX)(R12*1), R14 - MOVQ R14, (R9)(R12*1) - ADDQ $0x08, R12 - JMP copy_2_test - -copy_2: - MOVUPS (CX)(R12*1), X0 - MOVUPS X0, (R9)(R12*1) - ADDQ $0x10, R12 + ADDQ R13, R11 + MOVQ R13, R12 + SUBQ $0x10, R12 + JB copy_2_small -copy_2_test: - CMPQ R12, R13 - JB copy_2 +copy_2_loop: + MOVUPS (CX), X0 + MOVUPS X0, (R9) + ADDQ $0x10, CX + ADDQ $0x10, R9 + SUBQ $0x10, R12 + JAE copy_2_loop + LEAQ 16(CX)(R12*1), CX + LEAQ 16(R9)(R12*1), R9 + MOVUPS -16(CX), X0 + MOVUPS X0, -16(R9) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (CX), R12 + MOVB -1(CX)(R13*1), R14 + MOVB R12, (R9) + MOVB R14, -1(R9)(R13*1) + ADDQ R13, CX ADDQ R13, R9 - JMP handle_loop + JMP copy_2_end + +copy_2_move_3: + MOVW (CX), R12 + MOVB 2(CX), R14 + MOVW R12, (R9) + MOVB R14, 2(R9) + ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_4through7: + MOVL (CX), R12 + MOVL -4(CX)(R13*1), R14 + MOVL R12, (R9) + MOVL R14, -4(R9)(R13*1) + ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (CX), R12 + MOVQ -8(CX)(R13*1), R14 + MOVQ R12, (R9) + MOVQ R14, -8(R9)(R13*1) + ADDQ R13, CX + ADDQ R13, R9 + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: diff --git a/vendor/github.com/segmentio/kafka-go/compress/gzip/gzip.go b/vendor/github.com/segmentio/kafka-go/compress/gzip/gzip.go index 64da3129d..ad5009c39 100644 --- a/vendor/github.com/segmentio/kafka-go/compress/gzip/gzip.go +++ b/vendor/github.com/segmentio/kafka-go/compress/gzip/gzip.go @@ -1,9 +1,10 @@ package gzip import ( - "compress/gzip" "io" "sync" + + "github.com/klauspost/compress/gzip" ) var ( diff --git a/vendor/github.com/segmentio/kafka-go/compress/snappy/snappy.go b/vendor/github.com/segmentio/kafka-go/compress/snappy/snappy.go index a726ecebc..5bc6194f1 100644 --- a/vendor/github.com/segmentio/kafka-go/compress/snappy/snappy.go +++ b/vendor/github.com/segmentio/kafka-go/compress/snappy/snappy.go @@ -4,6 +4,7 @@ import ( "io" "sync" + "github.com/klauspost/compress/s2" "github.com/klauspost/compress/snappy" ) @@ -16,6 +17,16 @@ const ( Unframed ) +// Compression level. +type Compression int + +const ( + DefaultCompression Compression = iota + FasterCompression + BetterCompression + BestCompression +) + var ( readerPool sync.Pool writerPool sync.Pool @@ -28,6 +39,9 @@ type Codec struct { // // Default to Framed. Framing Framing + + // Compression level. + Compression Compression } // Code implements the compress.Codec interface. 
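Note on the rewritten copy paths above (vendored klauspost/compress zstd seqdec_amd64.s): the per-byte/word/dword/qword test chains are replaced by 16-byte MOVUPS loops, with short tails handled by a pair of possibly overlapping 4- or 8-byte loads and stores. A minimal Go sketch of that tail strategy, purely illustrative — the function name and fallback below are ours; the vendored code implements this in hand-written amd64 assembly, not Go:

package main

import (
	"encoding/binary"
	"fmt"
)

// copyShort mirrors the tail handling in the new assembly: lengths 8..16 are
// done with two possibly overlapping 8-byte loads/stores, 4..7 with two
// 4-byte ones, and anything shorter falls back to a plain byte copy here.
func copyShort(dst, src []byte, n int) {
	switch {
	case n >= 8:
		binary.LittleEndian.PutUint64(dst, binary.LittleEndian.Uint64(src))
		binary.LittleEndian.PutUint64(dst[n-8:], binary.LittleEndian.Uint64(src[n-8:]))
	case n >= 4:
		binary.LittleEndian.PutUint32(dst, binary.LittleEndian.Uint32(src))
		binary.LittleEndian.PutUint32(dst[n-4:], binary.LittleEndian.Uint32(src[n-4:]))
	default:
		copy(dst[:n], src[:n])
	}
}

func main() {
	src := []byte("0123456789abcdef")
	dst := make([]byte, len(src))
	copyShort(dst, src, 11)
	fmt.Printf("%q\n", dst[:11]) // "0123456789a"
}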
@@ -56,12 +70,19 @@ func (c *Codec) NewWriter(w io.Writer) io.WriteCloser { if x != nil { x.Reset(w) } else { - x = &xerialWriter{ - writer: w, - encode: snappy.Encode, - } + x = &xerialWriter{writer: w} } x.framed = c.Framing == Framed + switch c.Compression { + case FasterCompression: + x.encode = s2.EncodeSnappy + case BetterCompression: + x.encode = s2.EncodeSnappyBetter + case BestCompression: + x.encode = s2.EncodeSnappyBest + default: + x.encode = snappy.Encode // aka. s2.EncodeSnappyBetter + } return &writer{xerialWriter: x} } diff --git a/vendor/github.com/segmentio/kafka-go/conn.go b/vendor/github.com/segmentio/kafka-go/conn.go index e32dc2163..aa84f5a49 100644 --- a/vendor/github.com/segmentio/kafka-go/conn.go +++ b/vendor/github.com/segmentio/kafka-go/conn.go @@ -971,48 +971,101 @@ func (c *Conn) ReadPartitions(topics ...string) (partitions []Partition, err err topics = nil } } + metadataVersion, err := c.negotiateVersion(metadata, v1, v6) + if err != nil { + return nil, err + } err = c.readOperation( func(deadline time.Time, id int32) error { - return c.writeRequest(metadata, v1, id, topicMetadataRequestV1(topics)) + switch metadataVersion { + case v6: + return c.writeRequest(metadata, v6, id, topicMetadataRequestV6{Topics: topics, AllowAutoTopicCreation: true}) + default: + return c.writeRequest(metadata, v1, id, topicMetadataRequestV1(topics)) + } }, func(deadline time.Time, size int) error { - var res metadataResponseV1 + partitions, err = c.readPartitionsResponse(metadataVersion, size) + return err + }, + ) + return +} - if err := c.readResponse(size, &res); err != nil { - return err - } +func (c *Conn) readPartitionsResponse(metadataVersion apiVersion, size int) ([]Partition, error) { + switch metadataVersion { + case v6: + var res metadataResponseV6 + if err := c.readResponse(size, &res); err != nil { + return nil, err + } + brokers := readBrokerMetadata(res.Brokers) + return c.readTopicMetadatav6(brokers, res.Topics) + default: + var res metadataResponseV1 + if err := c.readResponse(size, &res); err != nil { + return nil, err + } + brokers := readBrokerMetadata(res.Brokers) + return c.readTopicMetadatav1(brokers, res.Topics) + } +} - brokers := make(map[int32]Broker, len(res.Brokers)) - for _, b := range res.Brokers { - brokers[b.NodeID] = Broker{ - Host: b.Host, - Port: int(b.Port), - ID: int(b.NodeID), - Rack: b.Rack, - } - } +func readBrokerMetadata(brokerMetadata []brokerMetadataV1) map[int32]Broker { + brokers := make(map[int32]Broker, len(brokerMetadata)) + for _, b := range brokerMetadata { + brokers[b.NodeID] = Broker{ + Host: b.Host, + Port: int(b.Port), + ID: int(b.NodeID), + Rack: b.Rack, + } + } + return brokers +} - for _, t := range res.Topics { - if t.TopicErrorCode != 0 && (c.topic == "" || t.TopicName == c.topic) { - // We only report errors if they happened for the topic of - // the connection, otherwise the topic will simply have no - // partitions in the result set. 
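A minimal usage sketch of the Compression knob added to the snappy Codec above; the payload and buffer are ours, and leaving Compression at its zero value (DefaultCompression) keeps the previous snappy.Encode path:

package main

import (
	"bytes"
	"fmt"

	"github.com/segmentio/kafka-go/compress/snappy"
)

func main() {
	// BestCompression routes encoding through s2.EncodeSnappyBest, as wired
	// up in the NewWriter switch shown in the diff above.
	codec := &snappy.Codec{Compression: snappy.BestCompression}

	var buf bytes.Buffer
	w := codec.NewWriter(&buf)
	if _, err := w.Write(bytes.Repeat([]byte("kafka "), 1024)); err != nil {
		panic(err)
	}
	if err := w.Close(); err != nil {
		panic(err)
	}
	fmt.Println("compressed size:", buf.Len())
}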
- return Error(t.TopicErrorCode) - } - for _, p := range t.Partitions { - partitions = append(partitions, Partition{ - Topic: t.TopicName, - Leader: brokers[p.Leader], - Replicas: makeBrokers(brokers, p.Replicas...), - Isr: makeBrokers(brokers, p.Isr...), - ID: int(p.PartitionID), - }) - } - } - return nil - }, - ) +func (c *Conn) readTopicMetadatav1(brokers map[int32]Broker, topicMetadata []topicMetadataV1) (partitions []Partition, err error) { + for _, t := range topicMetadata { + if t.TopicErrorCode != 0 && (c.topic == "" || t.TopicName == c.topic) { + // We only report errors if they happened for the topic of + // the connection, otherwise the topic will simply have no + // partitions in the result set. + return nil, Error(t.TopicErrorCode) + } + for _, p := range t.Partitions { + partitions = append(partitions, Partition{ + Topic: t.TopicName, + Leader: brokers[p.Leader], + Replicas: makeBrokers(brokers, p.Replicas...), + Isr: makeBrokers(brokers, p.Isr...), + ID: int(p.PartitionID), + OfflineReplicas: []Broker{}, + }) + } + } + return +} + +func (c *Conn) readTopicMetadatav6(brokers map[int32]Broker, topicMetadata []topicMetadataV6) (partitions []Partition, err error) { + for _, t := range topicMetadata { + if t.TopicErrorCode != 0 && (c.topic == "" || t.TopicName == c.topic) { + // We only report errors if they happened for the topic of + // the connection, otherwise the topic will simply have no + // partitions in the result set. + return nil, Error(t.TopicErrorCode) + } + for _, p := range t.Partitions { + partitions = append(partitions, Partition{ + Topic: t.TopicName, + Leader: brokers[p.Leader], + Replicas: makeBrokers(brokers, p.Replicas...), + Isr: makeBrokers(brokers, p.Isr...), + ID: int(p.PartitionID), + OfflineReplicas: makeBrokers(brokers, p.OfflineReplicas...), + }) + } + } return } diff --git a/vendor/github.com/segmentio/kafka-go/kafka.go b/vendor/github.com/segmentio/kafka-go/kafka.go index ec139ac91..d2d36e413 100644 --- a/vendor/github.com/segmentio/kafka-go/kafka.go +++ b/vendor/github.com/segmentio/kafka-go/kafka.go @@ -47,6 +47,9 @@ type Partition struct { Replicas []Broker Isr []Broker + // Available only with metadata API level >= 6: + OfflineReplicas []Broker + // An error that may have occurred while attempting to read the partition // metadata. // diff --git a/vendor/github.com/segmentio/kafka-go/metadata.go b/vendor/github.com/segmentio/kafka-go/metadata.go index 4b1309f85..429a6a260 100644 --- a/vendor/github.com/segmentio/kafka-go/metadata.go +++ b/vendor/github.com/segmentio/kafka-go/metadata.go @@ -199,3 +199,89 @@ func (p partitionMetadataV1) writeTo(wb *writeBuffer) { wb.writeInt32Array(p.Replicas) wb.writeInt32Array(p.Isr) } + +type topicMetadataRequestV6 struct { + Topics []string + AllowAutoTopicCreation bool +} + +func (r topicMetadataRequestV6) size() int32 { + return sizeofStringArray([]string(r.Topics)) + 1 +} + +func (r topicMetadataRequestV6) writeTo(wb *writeBuffer) { + // communicate nil-ness to the broker by passing -1 as the array length. + // for this particular request, the broker interpets a zero length array + // as a request for no topics whereas a nil array is for all topics. 
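The conn.go and kafka.go changes above negotiate metadata v6 when the broker supports it and surface OfflineReplicas on each Partition. A minimal sketch of reading them through the existing Conn API; the broker address and topic name are placeholders:

package main

import (
	"fmt"
	"log"

	"github.com/segmentio/kafka-go"
)

func main() {
	conn, err := kafka.Dial("tcp", "localhost:9092") // placeholder broker
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()

	partitions, err := conn.ReadPartitions("my-topic") // placeholder topic
	if err != nil {
		log.Fatal(err)
	}
	for _, p := range partitions {
		// OfflineReplicas is populated only when the broker speaks metadata v6+.
		fmt.Printf("partition %d: leader=%d offline=%d\n",
			p.ID, p.Leader.ID, len(p.OfflineReplicas))
	}
}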
+	if r.Topics == nil {
+		wb.writeArrayLen(-1)
+	} else {
+		wb.writeStringArray([]string(r.Topics))
+	}
+	wb.writeBool(r.AllowAutoTopicCreation)
+}
+
+type metadataResponseV6 struct {
+	ThrottleTimeMs int32
+	Brokers        []brokerMetadataV1
+	ClusterId      string
+	ControllerID   int32
+	Topics         []topicMetadataV6
+}
+
+func (r metadataResponseV6) size() int32 {
+	n1 := sizeofArray(len(r.Brokers), func(i int) int32 { return r.Brokers[i].size() })
+	n2 := sizeofNullableString(&r.ClusterId)
+	n3 := sizeofArray(len(r.Topics), func(i int) int32 { return r.Topics[i].size() })
+	return 4 + 4 + n1 + n2 + n3
+}
+
+func (r metadataResponseV6) writeTo(wb *writeBuffer) {
+	wb.writeInt32(r.ThrottleTimeMs)
+	wb.writeArray(len(r.Brokers), func(i int) { r.Brokers[i].writeTo(wb) })
+	wb.writeString(r.ClusterId)
+	wb.writeInt32(r.ControllerID)
+	wb.writeArray(len(r.Topics), func(i int) { r.Topics[i].writeTo(wb) })
+}
+
+type topicMetadataV6 struct {
+	TopicErrorCode int16
+	TopicName      string
+	Internal       bool
+	Partitions     []partitionMetadataV6
+}
+
+func (t topicMetadataV6) size() int32 {
+	return 2 + 1 +
+		sizeofString(t.TopicName) +
+		sizeofArray(len(t.Partitions), func(i int) int32 { return t.Partitions[i].size() })
+}
+
+func (t topicMetadataV6) writeTo(wb *writeBuffer) {
+	wb.writeInt16(t.TopicErrorCode)
+	wb.writeString(t.TopicName)
+	wb.writeBool(t.Internal)
+	wb.writeArray(len(t.Partitions), func(i int) { t.Partitions[i].writeTo(wb) })
+}
+
+type partitionMetadataV6 struct {
+	PartitionErrorCode int16
+	PartitionID        int32
+	Leader             int32
+	Replicas           []int32
+	Isr                []int32
+	OfflineReplicas    []int32
+}
+
+func (p partitionMetadataV6) size() int32 {
+	return 2 + 4 + 4 + sizeofInt32Array(p.Replicas) + sizeofInt32Array(p.Isr) + sizeofInt32Array(p.OfflineReplicas)
+}
+
+func (p partitionMetadataV6) writeTo(wb *writeBuffer) {
+	wb.writeInt16(p.PartitionErrorCode)
+	wb.writeInt32(p.PartitionID)
+	wb.writeInt32(p.Leader)
+	wb.writeInt32Array(p.Replicas)
+	wb.writeInt32Array(p.Isr)
+	wb.writeInt32Array(p.OfflineReplicas)
+}
diff --git a/vendor/github.com/segmentio/kafka-go/offsetdelete.go b/vendor/github.com/segmentio/kafka-go/offsetdelete.go
new file mode 100644
index 000000000..ea526eb25
--- /dev/null
+++ b/vendor/github.com/segmentio/kafka-go/offsetdelete.go
@@ -0,0 +1,106 @@
+package kafka
+
+import (
+	"context"
+	"fmt"
+	"net"
+	"time"
+
+	"github.com/segmentio/kafka-go/protocol/offsetdelete"
+)
+
+// OffsetDelete deletes the offset for a consumer group on a particular topic
+// for a particular partition.
+type OffsetDelete struct {
+	Topic     string
+	Partition int
+}
+
+// OffsetDeleteRequest represents a request sent to a kafka broker to delete
+// the offsets for a partition on a given topic associated with a consumer group.
+type OffsetDeleteRequest struct {
+	// Address of the kafka broker to send the request to.
+	Addr net.Addr
+
+	// ID of the consumer group to delete the offsets for.
+	GroupID string
+
+	// Set of topic partitions to delete offsets for.
+	Topics map[string][]int
+}
+
+// OffsetDeleteResponse represents a response from a kafka broker to a delete
+// offset request.
+type OffsetDeleteResponse struct {
+	// An error that may have occurred while attempting to delete an offset
+	Error error
+
+	// The amount of time that the broker throttled the request.
+	Throttle time.Duration
+
+	// Set of topic partitions that the kafka broker has additional info (error?)
+	// for.
+	Topics map[string][]OffsetDeletePartition
+}
+
+// OffsetDeletePartition represents the state of a status of a partition in response
+// to deleting offsets.
+type OffsetDeletePartition struct {
+	// ID of the partition.
+	Partition int
+
+	// An error that may have occurred while attempting to delete an offset for
+	// this partition.
+	Error error
+}
+
+// OffsetDelete sends a delete offset request to a kafka broker and returns the
+// response.
+func (c *Client) OffsetDelete(ctx context.Context, req *OffsetDeleteRequest) (*OffsetDeleteResponse, error) {
+	topics := make([]offsetdelete.RequestTopic, 0, len(req.Topics))
+
+	for topicName, partitionIndexes := range req.Topics {
+		partitions := make([]offsetdelete.RequestPartition, len(partitionIndexes))
+
+		for i, c := range partitionIndexes {
+			partitions[i] = offsetdelete.RequestPartition{
+				PartitionIndex: int32(c),
+			}
+		}
+
+		topics = append(topics, offsetdelete.RequestTopic{
+			Name:       topicName,
+			Partitions: partitions,
+		})
+	}
+
+	m, err := c.roundTrip(ctx, req.Addr, &offsetdelete.Request{
+		GroupID: req.GroupID,
+		Topics:  topics,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("kafka.(*Client).OffsetDelete: %w", err)
+	}
+	r := m.(*offsetdelete.Response)
+
+	res := &OffsetDeleteResponse{
+		Error:    makeError(r.ErrorCode, ""),
+		Throttle: makeDuration(r.ThrottleTimeMs),
+		Topics:   make(map[string][]OffsetDeletePartition, len(r.Topics)),
+	}
+
+	for _, topic := range r.Topics {
+		partitions := make([]OffsetDeletePartition, len(topic.Partitions))
+
+		for i, p := range topic.Partitions {
+			partitions[i] = OffsetDeletePartition{
+				Partition: int(p.PartitionIndex),
+				Error:     makeError(p.ErrorCode, ""),
+			}
+		}
+
+		res.Topics[topic.Name] = partitions
+	}
+
+	return res, nil
+}
diff --git a/vendor/github.com/segmentio/kafka-go/protocol.go b/vendor/github.com/segmentio/kafka-go/protocol.go
index 829fabf54..37208abf1 100644
--- a/vendor/github.com/segmentio/kafka-go/protocol.go
+++ b/vendor/github.com/segmentio/kafka-go/protocol.go
@@ -107,10 +107,11 @@ const (
 	v2  = 2
 	v3  = 3
 	v5  = 5
+	v6  = 6
 	v7  = 7
 	v10 = 10
 
-	// Unused protocol versions: v4, v6, v8, v9.
+	// Unused protocol versions: v4, v8, v9.
 )
 
 var apiKeyStrings = [...]string{
diff --git a/vendor/github.com/segmentio/kafka-go/protocol/offsetdelete/offsetdelete.go b/vendor/github.com/segmentio/kafka-go/protocol/offsetdelete/offsetdelete.go
new file mode 100644
index 000000000..bda619f3c
--- /dev/null
+++ b/vendor/github.com/segmentio/kafka-go/protocol/offsetdelete/offsetdelete.go
@@ -0,0 +1,47 @@
+package offsetdelete
+
+import "github.com/segmentio/kafka-go/protocol"
+
+func init() {
+	protocol.Register(&Request{}, &Response{})
+}
+
+type Request struct {
+	GroupID string         `kafka:"min=v0,max=v0"`
+	Topics  []RequestTopic `kafka:"min=v0,max=v0"`
+}
+
+func (r *Request) ApiKey() protocol.ApiKey { return protocol.OffsetDelete }
+
+func (r *Request) Group() string { return r.GroupID }
+
+type RequestTopic struct {
+	Name       string             `kafka:"min=v0,max=v0"`
+	Partitions []RequestPartition `kafka:"min=v0,max=v0"`
+}
+
+type RequestPartition struct {
+	PartitionIndex int32 `kafka:"min=v0,max=v0"`
+}
+
+var (
+	_ protocol.GroupMessage = (*Request)(nil)
+)
+
+type Response struct {
+	ErrorCode      int16           `kafka:"min=v0,max=v0"`
+	ThrottleTimeMs int32           `kafka:"min=v0,max=v0"`
+	Topics         []ResponseTopic `kafka:"min=v0,max=v0"`
+}
+
+func (r *Response) ApiKey() protocol.ApiKey { return protocol.OffsetDelete }
+
+type ResponseTopic struct {
+	Name       string              `kafka:"min=v0,max=v0"`
+	Partitions []ResponsePartition `kafka:"min=v0,max=v0"`
+}
+
+type ResponsePartition struct {
+	PartitionIndex int32 `kafka:"min=v0,max=v0"`
+	ErrorCode      int16 `kafka:"min=v0,max=v0"`
+}
diff --git a/vendor/github.com/segmentio/kafka-go/reader.go b/vendor/github.com/segmentio/kafka-go/reader.go
index facaf7090..8e46c338f 100644
--- a/vendor/github.com/segmentio/kafka-go/reader.go
+++ b/vendor/github.com/segmentio/kafka-go/reader.go
@@ -277,7 +277,7 @@ func (r *Reader) commitLoop(ctx context.Context, gen *Generation) {
 		l.Printf("stopped commit for group %s\n", r.config.GroupID)
 	})
 
-	if r.config.CommitInterval == 0 {
+	if r.useSyncCommits() {
 		r.commitLoopImmediate(ctx, gen)
 	} else {
 		r.commitLoopInterval(ctx, gen)
@@ -398,6 +398,11 @@ type ReaderConfig struct {
 	// Default: 10s
 	MaxWait time.Duration
 
+	// ReadBatchTimeout amount of time to wait to fetch message from kafka messages batch.
+	//
+	// Default: 10s
+	ReadBatchTimeout time.Duration
+
 	// ReadLagInterval sets the frequency at which the reader lag is updated.
 	// Setting this field to a negative value disables lag reporting.
 	ReadLagInterval time.Duration
@@ -649,6 +654,10 @@ func NewReader(config ReaderConfig) *Reader {
 		config.MaxWait = 10 * time.Second
 	}
 
+	if config.ReadBatchTimeout == 0 {
+		config.ReadBatchTimeout = 10 * time.Second
+	}
+
 	if config.ReadLagInterval == 0 {
 		config.ReadLagInterval = 1 * time.Minute
 	}
@@ -1052,12 +1061,16 @@ func (r *Reader) SetOffsetAt(ctx context.Context, t time.Time) error {
 	}
 	r.mutex.Unlock()
 
+	if len(r.config.Brokers) < 1 {
+		return errors.New("no brokers in config")
+	}
+	var conn *Conn
+	var err error
 	for _, broker := range r.config.Brokers {
-		conn, err := r.config.Dialer.DialLeader(ctx, "tcp", broker, r.config.Topic, r.config.Partition)
+		conn, err = r.config.Dialer.DialLeader(ctx, "tcp", broker, r.config.Topic, r.config.Partition)
 		if err != nil {
 			continue
 		}
-
 		deadline, _ := ctx.Deadline()
 		conn.SetDeadline(deadline)
 		offset, err := conn.ReadOffset(t)
@@ -1068,7 +1081,7 @@ func (r *Reader) SetOffsetAt(ctx context.Context, t time.Time) error {
 		return r.SetOffset(offset)
 	}
 
-	return fmt.Errorf("error setting offset for timestamp %+v", t)
+	return fmt.Errorf("error dialing all brokers, one of the errors: %w", err)
 }
 
 // Stats returns a snapshot of the reader stats since the last time the method
@@ -1181,22 +1194,23 @@ func (r *Reader) start(offsetsByPartition map[topicPartition]int64) {
 			defer join.Done()
 
 			(&reader{
-				dialer:          r.config.Dialer,
-				logger:          r.config.Logger,
-				errorLogger:     r.config.ErrorLogger,
-				brokers:         r.config.Brokers,
-				topic:           key.topic,
-				partition:       int(key.partition),
-				minBytes:        r.config.MinBytes,
-				maxBytes:        r.config.MaxBytes,
-				maxWait:         r.config.MaxWait,
-				backoffDelayMin: r.config.ReadBackoffMin,
-				backoffDelayMax: r.config.ReadBackoffMax,
-				version:         r.version,
-				msgs:            r.msgs,
-				stats:           r.stats,
-				isolationLevel:  r.config.IsolationLevel,
-				maxAttempts:     r.config.MaxAttempts,
+				dialer:           r.config.Dialer,
+				logger:           r.config.Logger,
+				errorLogger:      r.config.ErrorLogger,
+				brokers:          r.config.Brokers,
+				topic:            key.topic,
+				partition:        int(key.partition),
+				minBytes:         r.config.MinBytes,
+				maxBytes:         r.config.MaxBytes,
+				maxWait:          r.config.MaxWait,
+				readBatchTimeout: r.config.ReadBatchTimeout,
+				backoffDelayMin:  r.config.ReadBackoffMin,
+				backoffDelayMax:  r.config.ReadBackoffMax,
+				version:          r.version,
+				msgs:             r.msgs,
+				stats:            r.stats,
+				isolationLevel:   r.config.IsolationLevel,
+				maxAttempts:      r.config.MaxAttempts,
 
 				// backwards-compatibility flags
 				offsetOutOfRangeError: r.config.OffsetOutOfRangeError,
@@ -1209,22 +1223,23 @@
 // used as an way to asynchronously fetch messages while the main program reads
 // them using the high level reader API.
 type reader struct {
-	dialer          *Dialer
-	logger          Logger
-	errorLogger     Logger
-	brokers         []string
-	topic           string
-	partition       int
-	minBytes        int
-	maxBytes        int
-	maxWait         time.Duration
-	backoffDelayMin time.Duration
-	backoffDelayMax time.Duration
-	version         int64
-	msgs            chan<- readerMessage
-	stats           *readerStats
-	isolationLevel  IsolationLevel
-	maxAttempts     int
+	dialer           *Dialer
+	logger           Logger
+	errorLogger      Logger
+	brokers          []string
+	topic            string
+	partition        int
+	minBytes         int
+	maxBytes         int
+	maxWait          time.Duration
+	readBatchTimeout time.Duration
+	backoffDelayMin  time.Duration
+	backoffDelayMax  time.Duration
+	version          int64
+	msgs             chan<- readerMessage
+	stats            *readerStats
+	isolationLevel   IsolationLevel
+	maxAttempts      int
 
 	offsetOutOfRangeError bool
 }
@@ -1492,15 +1507,8 @@ func (r *reader) read(ctx context.Context, offset int64, conn *Conn) (int64, err
 	var size int64
 	var bytes int64
 
-	const safetyTimeout = 10 * time.Second
-	deadline := time.Now().Add(safetyTimeout)
-	conn.SetReadDeadline(deadline)
-
 	for {
-		if now := time.Now(); deadline.Sub(now) < (safetyTimeout / 2) {
-			deadline = now.Add(safetyTimeout)
-			conn.SetReadDeadline(deadline)
-		}
+		conn.SetReadDeadline(time.Now().Add(r.readBatchTimeout))
 
 		if msg, err = batch.ReadMessage(); err != nil {
 			batch.Close()
diff --git a/vendor/github.com/segmentio/kafka-go/writer.go b/vendor/github.com/segmentio/kafka-go/writer.go
index 8d48e95cd..dc9c77f78 100644
--- a/vendor/github.com/segmentio/kafka-go/writer.go
+++ b/vendor/github.com/segmentio/kafka-go/writer.go
@@ -100,6 +100,18 @@ type Writer struct {
 	// The default is to try at most 10 times.
 	MaxAttempts int
 
+	// WriteBackoffMin optionally sets the smallest amount of time the writer waits before
+	// it attempts to write a batch of messages
+	//
+	// Default: 100ms
+	WriteBackoffMin time.Duration
+
+	// WriteBackoffMax optionally sets the maximum amount of time the writer waits before
+	// it attempts to write a batch of messages
+	//
+	// Default: 1s
+	WriteBackoffMax time.Duration
+
 	// Limit on how many messages will be buffered before being sent to a
 	// partition.
 	//
@@ -343,17 +355,19 @@ type WriterStats struct {
 	BatchTime  DurationStats `metric:"kafka.writer.batch.seconds"`
 	WriteTime  DurationStats `metric:"kafka.writer.write.seconds"`
 	WaitTime   DurationStats `metric:"kafka.writer.wait.seconds"`
-	Retries    SummaryStats  `metric:"kafka.writer.retries.count"`
+	Retries    int64         `metric:"kafka.writer.retries.count" type:"counter"`
 	BatchSize  SummaryStats  `metric:"kafka.writer.batch.size"`
 	BatchBytes SummaryStats  `metric:"kafka.writer.batch.bytes"`
 
-	MaxAttempts  int64         `metric:"kafka.writer.attempts.max" type:"gauge"`
-	MaxBatchSize int64         `metric:"kafka.writer.batch.max" type:"gauge"`
-	BatchTimeout time.Duration `metric:"kafka.writer.batch.timeout" type:"gauge"`
-	ReadTimeout  time.Duration `metric:"kafka.writer.read.timeout" type:"gauge"`
-	WriteTimeout time.Duration `metric:"kafka.writer.write.timeout" type:"gauge"`
-	RequiredAcks int64         `metric:"kafka.writer.acks.required" type:"gauge"`
-	Async        bool          `metric:"kafka.writer.async" type:"gauge"`
+	MaxAttempts     int64         `metric:"kafka.writer.attempts.max" type:"gauge"`
+	WriteBackoffMin time.Duration `metric:"kafka.writer.backoff.min" type:"gauge"`
+	WriteBackoffMax time.Duration `metric:"kafka.writer.backoff.max" type:"gauge"`
+	MaxBatchSize    int64         `metric:"kafka.writer.batch.max" type:"gauge"`
+	BatchTimeout    time.Duration `metric:"kafka.writer.batch.timeout" type:"gauge"`
+	ReadTimeout     time.Duration `metric:"kafka.writer.read.timeout" type:"gauge"`
+	WriteTimeout    time.Duration `metric:"kafka.writer.write.timeout" type:"gauge"`
+	RequiredAcks    int64         `metric:"kafka.writer.acks.required" type:"gauge"`
+	Async           bool          `metric:"kafka.writer.async" type:"gauge"`
 
 	Topic string `tag:"topic"`
 
@@ -390,7 +404,7 @@ type writerStats struct {
 	batchTime      summary
 	writeTime      summary
 	waitTime       summary
-	retries        summary
+	retries        counter
 	batchSize      summary
 	batchSizeBytes summary
 }
@@ -779,6 +793,20 @@ func (w *Writer) maxAttempts() int {
 	return 10
 }
 
+func (w *Writer) writeBackoffMin() time.Duration {
+	if w.WriteBackoffMin > 0 {
+		return w.WriteBackoffMin
+	}
+	return 100 * time.Millisecond
+}
+
+func (w *Writer) writeBackoffMax() time.Duration {
+	if w.WriteBackoffMax > 0 {
+		return w.WriteBackoffMax
+	}
+	return 1 * time.Second
+}
+
 func (w *Writer) batchSize() int {
 	if w.BatchSize > 0 {
 		return w.BatchSize
@@ -849,26 +877,28 @@ func (w *Writer) stats() *writerStats {
 func (w *Writer) Stats() WriterStats {
 	stats := w.stats()
 	return WriterStats{
-		Dials:        stats.dials.snapshot(),
-		Writes:       stats.writes.snapshot(),
-		Messages:     stats.messages.snapshot(),
-		Bytes:        stats.bytes.snapshot(),
-		Errors:       stats.errors.snapshot(),
-		DialTime:     stats.dialTime.snapshotDuration(),
-		BatchTime:    stats.batchTime.snapshotDuration(),
-		WriteTime:    stats.writeTime.snapshotDuration(),
-		WaitTime:     stats.waitTime.snapshotDuration(),
-		Retries:      stats.retries.snapshot(),
-		BatchSize:    stats.batchSize.snapshot(),
-		BatchBytes:   stats.batchSizeBytes.snapshot(),
-		MaxAttempts:  int64(w.MaxAttempts),
-		MaxBatchSize: int64(w.BatchSize),
-		BatchTimeout: w.BatchTimeout,
-		ReadTimeout:  w.ReadTimeout,
-		WriteTimeout: w.WriteTimeout,
-		RequiredAcks: int64(w.RequiredAcks),
-		Async:        w.Async,
-		Topic:        w.Topic,
+		Dials:           stats.dials.snapshot(),
+		Writes:          stats.writes.snapshot(),
+		Messages:        stats.messages.snapshot(),
+		Bytes:           stats.bytes.snapshot(),
+		Errors:          stats.errors.snapshot(),
+		DialTime:        stats.dialTime.snapshotDuration(),
+		BatchTime:       stats.batchTime.snapshotDuration(),
+		WriteTime:       stats.writeTime.snapshotDuration(),
+		WaitTime:        stats.waitTime.snapshotDuration(),
+		Retries:         stats.retries.snapshot(),
+		BatchSize:       stats.batchSize.snapshot(),
+		BatchBytes:      stats.batchSizeBytes.snapshot(),
+		MaxAttempts:     int64(w.MaxAttempts),
+		WriteBackoffMin: w.WriteBackoffMin,
+		WriteBackoffMax: w.WriteBackoffMax,
+		MaxBatchSize:    int64(w.BatchSize),
+		BatchTimeout:    w.BatchTimeout,
+		ReadTimeout:     w.ReadTimeout,
+		WriteTimeout:    w.WriteTimeout,
+		RequiredAcks:    int64(w.RequiredAcks),
+		Async:           w.Async,
+		Topic:           w.Topic,
 	}
 }
 
@@ -1086,7 +1116,7 @@ func (ptw *partitionWriter) writeBatch(batch *writeBatch) {
 			// guarantees to abort, but may be better to avoid long wait times
 			// on close.
 			//
-			delay := backoff(attempt, 100*time.Millisecond, 1*time.Second)
+			delay := backoff(attempt, ptw.w.writeBackoffMin(), ptw.w.writeBackoffMax())
 			ptw.w.withLogger(func(log Logger) {
 				log.Printf("backing off %s writing %d messages to %s (partition: %d)", delay, len(batch.msgs), key.topic, key.partition)
 			})
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 2d7cba3ed..cd533f1f3 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -120,10 +120,12 @@ github.com/jpillora/backoff
 # github.com/json-iterator/go v1.1.12
 ## explicit; go 1.12
 github.com/json-iterator/go
-# github.com/klauspost/compress v1.15.7
+# github.com/klauspost/compress v1.15.9
 ## explicit; go 1.16
 github.com/klauspost/compress
+github.com/klauspost/compress/flate
 github.com/klauspost/compress/fse
+github.com/klauspost/compress/gzip
 github.com/klauspost/compress/huff0
 github.com/klauspost/compress/internal/cpuinfo
 github.com/klauspost/compress/internal/snapref
@@ -198,8 +200,6 @@ github.com/netsampler/goflow2/utils
 # github.com/pelletier/go-toml v1.9.4
 ## explicit; go 1.12
 github.com/pelletier/go-toml
-# github.com/pierrec/lz4 v2.6.1+incompatible
-## explicit
 # github.com/pierrec/lz4/v4 v4.1.15
 ## explicit; go 1.14
 github.com/pierrec/lz4/v4
@@ -245,7 +245,7 @@ github.com/prometheus/prometheus/tsdb/errors
 github.com/prometheus/prometheus/tsdb/fileutil
 github.com/prometheus/prometheus/tsdb/tsdbutil
 github.com/prometheus/prometheus/util/strutil
-# github.com/segmentio/kafka-go v0.4.35
+# github.com/segmentio/kafka-go v0.4.38
 ## explicit; go 1.15
 github.com/segmentio/kafka-go
 github.com/segmentio/kafka-go/compress
@@ -279,6 +279,7 @@ github.com/segmentio/kafka-go/protocol/listgroups
 github.com/segmentio/kafka-go/protocol/listoffsets
 github.com/segmentio/kafka-go/protocol/metadata
 github.com/segmentio/kafka-go/protocol/offsetcommit
+github.com/segmentio/kafka-go/protocol/offsetdelete
 github.com/segmentio/kafka-go/protocol/offsetfetch
 github.com/segmentio/kafka-go/protocol/produce
 github.com/segmentio/kafka-go/protocol/saslauthenticate
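
Usage notes on the API surface pulled in by this bump follow. They are illustrative sketches against what the diff shows, not part of the vendored change itself.

The first hunk in this excerpt (its file header falls outside the excerpt, but the xerialWriter, Framing, and s2 references suggest the compress/snappy codec) lets the snappy codec choose between the s2 snappy-compatible encoders. A minimal sketch, assuming the Codec type and the FasterCompression/BetterCompression/BestCompression constants shown in the hunk are exported from github.com/segmentio/kafka-go/compress/snappy:

package main

import (
	"bytes"
	"log"

	"github.com/segmentio/kafka-go/compress/snappy"
)

func main() {
	// Pick the fastest snappy-compatible encoder; leaving Compression at its
	// zero value keeps the previous default behavior per the diff.
	codec := &snappy.Codec{Compression: snappy.FasterCompression}

	var buf bytes.Buffer
	w := codec.NewWriter(&buf)
	if _, err := w.Write([]byte("some payload")); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}
	log.Printf("compressed to %d bytes", buf.Len())
}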
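
The conn.go and metadata.go hunks negotiate metadata API v6 when the broker supports it, which populates the new Partition.OfflineReplicas field returned by ReadPartitions. A sketch of reading it; the broker address and topic name are placeholders:

package main

import (
	"context"
	"log"

	kafka "github.com/segmentio/kafka-go"
)

func main() {
	conn, err := kafka.DialContext(context.Background(), "tcp", "localhost:9092")
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()

	partitions, err := conn.ReadPartitions("example-topic")
	if err != nil {
		log.Fatal(err)
	}
	for _, p := range partitions {
		// OfflineReplicas is only filled in when the broker negotiates
		// metadata v6 or newer; against older brokers it stays empty.
		log.Printf("partition %d: leader %d, %d offline replicas",
			p.ID, p.Leader.ID, len(p.OfflineReplicas))
	}
}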
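
The new offsetdelete.go adds Client.OffsetDelete, which deletes committed offsets for a consumer group. A minimal sketch based on the request/response types in the diff; the broker address, group, topic, and partitions are placeholders:

package main

import (
	"context"
	"log"

	kafka "github.com/segmentio/kafka-go"
)

func main() {
	client := &kafka.Client{Addr: kafka.TCP("localhost:9092")}

	resp, err := client.OffsetDelete(context.Background(), &kafka.OffsetDeleteRequest{
		Addr:    client.Addr,
		GroupID: "example-group",
		Topics: map[string][]int{
			"example-topic": {0, 1, 2},
		},
	})
	if err != nil {
		log.Fatal(err)
	}
	if resp.Error != nil {
		log.Fatal(resp.Error)
	}
	// Per-partition errors come back in resp.Topics.
	for topic, partitions := range resp.Topics {
		for _, p := range partitions {
			log.Printf("topic %s partition %d: err=%v", topic, p.Partition, p.Error)
		}
	}
}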
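
The reader.go hunks replace the hard-coded 10s safetyTimeout in (*reader).read with a configurable ReaderConfig.ReadBatchTimeout (still defaulting to 10s), and SetOffsetAt now reports the last dial error instead of a generic message. A sketch of setting the new field; brokers, topic, and group are placeholders:

package main

import (
	"context"
	"log"
	"time"

	kafka "github.com/segmentio/kafka-go"
)

func main() {
	r := kafka.NewReader(kafka.ReaderConfig{
		Brokers: []string{"localhost:9092"},
		Topic:   "example-topic",
		GroupID: "example-group",
		MaxWait: 5 * time.Second,
		// New in this version: used as the connection read deadline while
		// reading each message out of a fetched batch.
		ReadBatchTimeout: 15 * time.Second,
	})
	defer r.Close()

	msg, err := r.ReadMessage(context.Background())
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("offset %d: %s", msg.Offset, string(msg.Value))
}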
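
The writer.go hunks make the batch-retry backoff configurable through Writer.WriteBackoffMin and Writer.WriteBackoffMax (previously fixed at 100ms and 1s), and change WriterStats.Retries from a SummaryStats value to a plain int64 counter, which matters to callers that scrape Stats(). A sketch; broker address and topic are placeholders:

package main

import (
	"context"
	"log"
	"time"

	kafka "github.com/segmentio/kafka-go"
)

func main() {
	w := &kafka.Writer{
		Addr:  kafka.TCP("localhost:9092"),
		Topic: "example-topic",
		// New in this version: bounds for the retry backoff applied when a
		// batch write fails.
		WriteBackoffMin: 250 * time.Millisecond,
		WriteBackoffMax: 5 * time.Second,
	}
	defer w.Close()

	if err := w.WriteMessages(context.Background(), kafka.Message{Value: []byte("hello")}); err != nil {
		log.Fatal(err)
	}

	// Retries is now an int64 counter rather than a summary.
	log.Printf("retries so far: %d", w.Stats().Retries)
}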