Starts stalling under load #13

AudriusButkevicius · 2016-05-26T22:24:12Z

Using this benchmark:

import (
    "net"
    "testing"

    "github.com/anacrolix/utp"
)

func getTCPConnectionPair() (net.Conn, net.Conn, error) {
    lst, err := net.Listen("tcp", "127.0.0.1:0")
    if err != nil {
        return nil, nil, err
    }

    var conn0 net.Conn
    var err0 error
    done := make(chan struct{})
    go func() {
        conn0, err0 = lst.Accept()
        close(done)
    }()

    conn1, err := net.Dial("tcp", lst.Addr().String())
    if err != nil {
        return nil, nil, err
    }

    <-done
    if err0 != nil {
        return nil, nil, err0
    }
    return conn0, conn1, nil
}

func getUTPConnectionPair() (net.Conn, net.Conn, error) {
    lst, err := utp.NewSocket("udp", "127.0.0.1:0")
    if err != nil {
        return nil, nil, err
    }

    var conn0 net.Conn
    var err0 error
    done := make(chan struct{})
    go func() {
        conn0, err0 = lst.Accept()
        close(done)
    }()

    conn1, err := utp.Dial(lst.Addr().String())
    if err != nil {
        return nil, nil, err
    }

    <-done
    if err0 != nil {
        return nil, nil, err0
    }

    return conn0, conn1, nil
}


func benchConnPair(b *testing.B, c0, c1 net.Conn) {
    b.ReportAllocs()
    b.SetBytes(128 << 10)
    b.ResetTimer()

    request := make([]byte, 52)
    response := make([]byte, (128<<10)+8)

    pair := []net.Conn{c0, c1}
    for i := 0; i < b.N; i++ {
        if i%2 == 0 {
            pair[0] = c0
            pair[1] = c1
        } else {
            pair[0] = c1
            pair[1] = c0
        }

        if _, err := pair[0].Write(request); err != nil {
            b.Fatal(err)
        }

        if _, err := pair[1].Read(request[:8]); err != nil {
            b.Fatal(err)
        }
        if _, err := pair[1].Read(request[8:]); err != nil {
            b.Fatal(err)
        }
        if _, err := pair[1].Write(response); err != nil {
            b.Fatal(err)
        }
        if _, err := pair[0].Read(response[:8]); err != nil {
            b.Fatal(err)
        }
        if _, err := pair[0].Read(response[8:]); err != nil {
            b.Fatal(err)
        }
    }
}

func BenchmarkTCP(b *testing.B) {
    conn0, conn1, err := getTCPConnectionPair()
    if err != nil {
        b.Fatal(err)
    }

    defer conn0.Close()
    defer conn1.Close()

    benchConnPair(b, conn0, conn1)
}

func BenchmarkUTP(b *testing.B) {
    conn0, conn1, err := getUTPConnectionPair()
    if err != nil {
        b.Fatal(err)
    }

    defer conn0.Close()
    defer conn1.Close()

    benchConnPair(b, conn0, conn1)
}

I get the following:

PASS
BenchmarkTCP-4      5000        263246 ns/op     497.91 MB/s          27 B/op          0 allocs/op
BenchmarkUTP-4  2016/05/26 23:21:51 error resending packet: write udp [::]:56345->127.0.0.1:56344: use of closed network connection
2016/05/26 23:21:51 error resending packet: write udp [::]:56345->127.0.0.1:56344: use of closed network connection
2016/05/26 23:21:51 error resending packet: write udp [::]:56345->127.0.0.1:56344: use of closed network connection
     100     134251556 ns/op       0.98 MB/s     1195857 B/op       4016 allocs/op

If you tweak the numbers a bit, you might get it to 0.2MB/s

I did manage to get it to a point where it just mostly never completes and fails with "connection closed", but that was using syncthings protocol wrappers which do async read/writes. In cases where it did complete it was around 0.3MB/s

The text was updated successfully, but these errors were encountered:

anacrolix · 2016-05-27T03:05:58Z

Is this a new trend, or has this benchmark been used on eagle earlier revisions of uTP? This looks like a useful benchmark, I'll do some work with it.

anacrolix · 2016-05-27T04:44:24Z

I've done some investigating. I've found similar issues with these kinds of tests before. Essentially people forget that UDP is not guaranteed delivery, even over localhost. After reaching a certain level of UDP packet throughput (10k/s roughly on mac?), packets start getting dropped for long periods of time. I've added the "inproc" test, which bypasses UDP and gives you a more theoretical limit to performance. I recommend you create more realistic, real-world tests, for example transfers over links with 1-2ms latency, which would a more lower bound for latency between machines.

If you want to continue chasing performance on localhost, I believe there are a few options:

Improve the latency calculation, and retransmission code so that retries occur sooner.
Rate limit UDP packets, so that they're slowed down before the system starts dropping them. I don't currently do this, except externally, in a few tests.

Let me know your thoughts, I'll do some more experimenting with this in the meanwhile.

AudriusButkevicius · 2016-05-27T06:15:42Z

So under normal circumstances and a real world scenario it seems to do just fine. I added this synthetic benchmark as we have it for tcp. I haven't tested it previous revisions, just head, so I can't claim that it's a regression. I guess I'll add sleep to the benchmarks and see if I can get it to show more realistic numbers, but I wonder if the package should handle situations like these, give tcp.works well even under sythetic conditions?

anacrolix · 2016-05-27T06:35:27Z

This is what I get on my machine:

$ go test -bench Syncthing -run @ github.com/anacrolix/utp
BenchmarkSyncthingTCP-4            10000        130547 ns/op    1004.01 MB/s          13 B/op          0 allocs/op
BenchmarkSyncthingUDPUTP-4           100      61124441 ns/op       2.14 MB/s     1350979 B/op       4229 allocs/op
BenchmarkSyncthingInprocUTP-4        500       2304482 ns/op      56.88 MB/s     1706268 B/op       4040 allocs/op

AudriusButkevicius · 2016-05-27T18:21:00Z

So even if I add a 10ms delay between each iteration of the benchmark, I still get a splew of 2016/05/27 19:18:27 error resending packet: write udp [::]:60754->127.0.0.1:60753: use of closed network connection and the rate is still fairly horrid, nowhere close the 56MB you get over a pipe pair.

AudriusButkevicius · 2016-05-27T18:30:05Z

I agree that the implementation is capable of achieving 56mb under ideal lossless conditions, but it seems that it really struggles when where is some packet loss?

anacrolix · 2016-05-28T04:41:52Z

Open to improvements!

anacrolix · 2016-05-28T06:21:23Z

I concur, it should be faster. What command are you running when you get those log messages?

AudriusButkevicius · 2016-05-28T10:13:48Z

I am running the same command as you are in your example, yet I am on Windows, so perhaps the semantics are different.

To be honest, I am struggling to repro this now.

I've dug deeper into this, and usually the stalling happens when we need to recv the 128kb buffer, but it properly dies when we get timed out waiting for ack, which is not always but fairly often, even on a local connection.

anacrolix · 2016-05-28T11:36:45Z

Hm that's interesting. The network stack does perform very differently between platforms when you push it. If you get timed out waiting for ack, that's utp.Conn giving up completely. You shouldn't see this in tests unless your system isn't coping. I suspect in your syncthing tests, this is when so many retries have happened, and still not got through that it times out. The other log message, use of closed network connection, I should be suppressing.

AudriusButkevicius · 2016-05-28T11:53:44Z

So I think that pushing it is an overstatement, given it barely reaches 1MB/s. Also number of Write() calls that we produce is in the hundreds over 10s benchmark, so I don't think we're hitting a 10k packets a second problem here either (unless it's mostly UTP overhead)

I guess the other thing I could do is "record" the payloads I am sending, and at the point it hits the problem, try replaying it see if I can repro it?

Is there any meaningful logging I could enable which would tell me about the state of the UTC connection? (window sizes etc etc)

anacrolix · 2016-05-28T12:01:56Z

utp breaks up your Conn.Write calls into chunks of no more than 1202 bytes each. Those have to be retransmitted on failure, and acknowledged at intervals by the receiver.

There are several very important variables in a const block at the top of utp.go. There's also testing timeouts set by setDefaultTestingDurations for going over localhost.

envpprof exposes status information over http on localhost for some tests, although it doesn't go to the individual Conn currently. That would be very easy to add. See status.go. It's exposed by passing GOPPROF=http as an environment variable, the status is printed to the index handler at /, check the log while running tests for the port. Also see /debug/vars, where all the expvar variables are exported. Those are also listed at the top of utp.go.

anacrolix · 2016-05-29T04:35:22Z

Well turns out the test was bad. I went to implement a purely UDP version to see if the UDP overhead is what was killing it. Then I discovered the reads and writes are synchronous, and mutually exclusive. I've fixed this, and here are the new benchmarks on my machine:

$ GOPPROF=http go test -bench Syncthing -run @ github.com/anacrolix/utp
2016/05/29 14:30:03 envpprof serving http://127.0.0.1:6061
PASS
BenchmarkSyncthingTCP-4 20000 89890 ns/op 1458.80 MB/s 112 B/op 13 allocs/op
BenchmarkSyncthingUDPUTP-4 500 2419213 ns/op 54.20 MB/s 1032166 B/op 2027 allocs/op
BenchmarkSyncthingInprocUTP-4 1000 1185226 ns/op 110.64 MB/s 1247499 B/op 1938 allocs/op

Much nicer. Also this would explain why all my attempted performance tweaks seemed to have no impact: The first ~64KB was written in each test, and the rest would have been struggled out through incessant resends, as the uTP window isn't as high as the TCP one on my system. I would guess that the TCP window is lower on your Windows machine too, which would explain your occasional timeouts. If the Write stalls because the window is full, it'll never get to the read, if you do a 128KB synchronous read like it was.

AudriusButkevicius · 2016-05-29T09:33:03Z

I am not sure I understand why would synchronous mutually exclusive reads and writes cause problems?

So I understand that you changed the test to fix this, but it's a bit of a mystery to me why the same code works with TCP without the fix?

It makes me think that I have to write the application in a specific way in order to use UTP and not get stalling?

anacrolix · 2016-05-30T05:35:17Z

Change your original test to use io.ReadFull in place of your Read calls. Read is returning early, and not all the bytes. It doesn't even need to be concurrently read and written. I've tried this per your originally provided test, and it also corrects the issue. I won't add it to my standard suite of tests, as it only coincidentally works due to sufficient window sizes.

func requireReadFull(tb testing.TB, r io.Reader, b []byte) {
    n, err := io.ReadFull(r, b)
    require.NoError(tb, err)
    require.EqualValues(tb, len(b), n)
}

The windows are relevant, but aren't the issue in this case. https://en.wikipedia.org/wiki/TCP_tuning#Window_size, and read the spec on io.Writer and io.Reader for more.

anacrolix · 2016-05-30T05:39:53Z

Here's the patch that fixes the synchronous version.

14.txt

whyrusleeping · 2016-06-01T20:56:15Z

Glad to see this got reported and fixed!! I've been noticing the same issue when stressing utp connections with ipfs

AudriusButkevicius · 2016-06-01T20:59:50Z

It was a fix to the test not the code.
Sadly when actually trying to use it, it still stalls/disconnects with timed out waiting for ack

whyrusleeping · 2016-06-01T21:00:14Z

@AudriusButkevicius ah, dang.

AudriusButkevicius · 2016-06-01T21:00:46Z

So I know this library is used in the torrent library, so I'd assume that it has had some exposure and usage?

whyrusleeping · 2016-06-01T21:02:44Z

Yeah, its pretty stable overall. In our usage though there are some occasional misbehaviours that i havent been able to track down too closely yet. I'll update our code to use the latest version of this lib soon and stress things again

AudriusButkevicius · 2016-06-01T21:09:54Z

So for me, on a fairly good line it's not bad, but over ADSL in some rural area, it disconnects quite fast.

whyrusleeping · 2016-06-01T21:16:02Z

@AudriusButkevicius if you have some free time, do you think you could try reproducing those failures in a test? For ipfs i've done some work with docker nodes and tc to simulate slow and lossy networks. Most of my work towards that is here and here. Having some semi-automated docker test to repro the problems would be a great start.

anacrolix · 2016-06-02T04:15:15Z

This package is used in production 24/7 on linux/amd64, and gets a lot of testing on osx. There's another issue that's come to my attention when running utp on Windows. If you guys are getting these stalls on Windows, it's probably related, if not the actual problem. anacrolix/torrent#83. I've pushed some workarounds to that in the meanwhile. d7ad5af

anacrolix · 2016-06-02T04:16:16Z

This issue is about the stalls, not the broken test, I shouldn't have closed it.

whyrusleeping · 2016-06-15T19:10:46Z

So, i'm still getting stalls on linux machines. Nothing is hanging, messages still appear to be coming in and processed by the utp read loop (no select 15 minutes type messages in the stack dumps). But my code that is waiting to read on utp sockets is hanging for hours.

Here is an example (large) stack trace: https://ipfs.io/ipfs/QmSLTNdSNWSus9a83ToR2WRho9HeCMSTT7KTbEqx4vyqDo

The yamux reads that are stuck for two or more hours are waiting on data from utp

anacrolix · 2016-06-16T02:14:29Z

If the remote end of the conn has closed without a FIN getting through, and you never attempt to send to them, you will never Read anything again. There's a timeout for writes, but no read keepalive. The easiest solution would be to ensure you have an application level keep alive in your protocol that is sitting on top of utp. This is how BitTorrent does it (2 mins), which would be why I've never encountered this. If you're aware of a keep alive (probably a state packet every so often) in the utp spec I'll implement that.

stunndard · 2016-06-25T13:59:11Z

I can confirm that UDP connections stall and after 3-5 minutes of launching torrent, it keeps downloading only using TCP. Almost nothing gets downloaded using UDP after some time. New UDP connections keep created and totally only a small amount of traffic gets received (300-400kb) per UDP connection until the torrent is finished. TCP connections at the same time receive 10-50MB each before the torrent is finished. When the torrent is started, UDP connections receive traffic with the same speed as the TCP ones, but after some point all UDP connections stall. I suspect it only happens when no other LAN computers download at the same time using the same internet connection, i.e. when the internet connection is monopolized by anacrolix/torrent.

anacrolix · 2016-06-26T01:45:24Z

@stunndard Are you behind a NAT router for ADSL or some such?

AudriusButkevicius · 2016-06-28T08:44:42Z

Why does ADSL matter?

anacrolix · 2016-06-28T16:16:21Z

ADSL implies less reliable connection, specifically packet loss, and likely use of NAT. Typical NAT routers aren't good with 100s of simultaneous connections, and UDP in particular due to unreliable rewriting of addresses. My production servers using uTP aren't behind any NAT so testing there may not be thorough. It's not uncommon for mainstream torrent clients to flat out fail to do uTP reliably in these circumstances.

AudriusButkevicius · 2016-06-28T16:24:03Z

Sorry, but I was under the impression that main point of UTP is to punch through nats, whatever the conditions.

I used torrents behind crappy ADSL nat, which I assume used UTP and never had an issue

anacrolix · 2016-06-28T16:51:47Z

I only implement uTP because it's in major use for BitTorrent, and having it available increases the number of peers available. I don't think uTP is worth all the attention it gets. It's like the MongoDB of network protocols.

stunndard · 2016-06-29T08:51:36Z

I'm behind my router NAT with the torrent port forwarded. However, there are no problems ever if I download with uTorrent, moreover, uTorrent downloads most of the data using uTP, and it is very fast. When monitoring uTorrent, I can see that the most of download connections are UDP (uTP), not TCP. But your torrent after 3-5 minutes stops using UDP at all and the rest is downloaded using TCP only. I don't agree uTP is not important, it was designed to reduce load on the network equipment and be more adaptive when co-using network connections with other applications.

anacrolix · 2016-11-01T08:57:08Z

I suspect UDP packets are being dropped due to your bandwidth being maxed out, and anacrolix/utp is not recovering too well. In particular, sending state packets after a period of not hearing from the peer will likely help here.

anacrolix · 2017-04-10T12:16:23Z

Is there any update on this?

AudriusButkevicius · 2017-04-11T08:45:37Z

Well, I should take it for another spin, but as far as I tested previously it just chokes at some point and does not recover.

anacrolix · 2017-04-11T10:05:52Z

That seems strange, I use it in production for months at a time with 100s of connections simultaneously. I would recommend you ensure you have Read and Write timeouts set.

AudriusButkevicius · 2017-04-11T10:32:34Z

I guess my use was prior #83 in torrent package was addressed, so I should take it for another spin, yet the fact that there is no congestion control could be the cause of this.

anacrolix · 2017-06-11T06:03:12Z

Take a look at https://github.com/anacrolix/go-libutp.

AudriusButkevicius · 2017-06-11T10:30:01Z

This would kill our ability to support various exotic platforms due to cgo :(

mh-cbon · 2017-09-10T00:29:25Z

I have tested to switch to go-libutp rather than utp, and its much better, its awesome :D

anacrolix · 2017-09-10T03:59:53Z

Cheers, I've been using it without problems since anacrolix/go-libutp@750705a.

anacrolix added a commit that referenced this issue May 27, 2016

Add provided benchmark tests from #13

a705995

anacrolix closed this as completed May 30, 2016

anacrolix reopened this Jun 2, 2016

pmorch mentioned this issue Aug 1, 2017

Setting the UDP source port (adding laddr parameter to DialWithOptions in sess.go?) xtaci/kcp-go#51

Closed

anacrolix added the help wanted label Feb 1, 2018

Starts stalling under load #13

Starts stalling under load #13

Comments

AudriusButkevicius commented May 26, 2016

anacrolix commented May 27, 2016

anacrolix commented May 27, 2016

AudriusButkevicius commented May 27, 2016 • edited Loading

anacrolix commented May 27, 2016

AudriusButkevicius commented May 27, 2016

AudriusButkevicius commented May 27, 2016

anacrolix commented May 28, 2016

anacrolix commented May 28, 2016

AudriusButkevicius commented May 28, 2016 • edited Loading

anacrolix commented May 28, 2016

AudriusButkevicius commented May 28, 2016 • edited Loading

anacrolix commented May 28, 2016

anacrolix commented May 29, 2016

AudriusButkevicius commented May 29, 2016 • edited Loading

anacrolix commented May 30, 2016

anacrolix commented May 30, 2016

whyrusleeping commented Jun 1, 2016

AudriusButkevicius commented Jun 1, 2016

whyrusleeping commented Jun 1, 2016

AudriusButkevicius commented Jun 1, 2016

whyrusleeping commented Jun 1, 2016

AudriusButkevicius commented Jun 1, 2016

whyrusleeping commented Jun 1, 2016

anacrolix commented Jun 2, 2016

anacrolix commented Jun 2, 2016

whyrusleeping commented Jun 15, 2016

anacrolix commented Jun 16, 2016

stunndard commented Jun 25, 2016 • edited Loading

anacrolix commented Jun 26, 2016

AudriusButkevicius commented Jun 28, 2016

anacrolix commented Jun 28, 2016

AudriusButkevicius commented Jun 28, 2016

anacrolix commented Jun 28, 2016

stunndard commented Jun 29, 2016

anacrolix commented Nov 1, 2016

anacrolix commented Apr 10, 2017

AudriusButkevicius commented Apr 11, 2017

anacrolix commented Apr 11, 2017

AudriusButkevicius commented Apr 11, 2017

anacrolix commented Jun 11, 2017

AudriusButkevicius commented Jun 11, 2017

mh-cbon commented Sep 10, 2017

anacrolix commented Sep 10, 2017

AudriusButkevicius commented May 27, 2016 •

edited

Loading

AudriusButkevicius commented May 28, 2016 •

edited

Loading

AudriusButkevicius commented May 28, 2016 •

edited

Loading

AudriusButkevicius commented May 29, 2016 •

edited

Loading

stunndard commented Jun 25, 2016 •

edited

Loading