diff --git a/lib/ljsyscall/.dockerignore b/lib/ljsyscall/.dockerignore new file mode 100644 index 0000000000..fd1d8943f6 --- /dev/null +++ b/lib/ljsyscall/.dockerignore @@ -0,0 +1,12 @@ +.* +*.md +COPYRIGHT +ChangeLog +Dockerfile +INSTALL +doc +*.yml +examples +include +rockspec +test diff --git a/lib/ljsyscall/.gitignore b/lib/ljsyscall/.gitignore index ea3ef8819f..a6e4c5785e 100644 --- a/lib/ljsyscall/.gitignore +++ b/lib/ljsyscall/.gitignore @@ -4,3 +4,12 @@ tmp/* *.core ktrace.out obj/* + +/5.1-ljsyscall +/debian/debhelper-build-stamp +/debian/files +/debian/lua-ljsyscall* +/debian/lua_versions +/debian/tmp +/debian/trash + diff --git a/lib/ljsyscall/.travis.yml b/lib/ljsyscall/.travis.yml index 6c90dc6e7c..e474901663 100644 --- a/lib/ljsyscall/.travis.yml +++ b/lib/ljsyscall/.travis.yml @@ -1,11 +1,15 @@ language: c +sudo: required +dist: trusty + +addons: + apt: + packages: + - luajit + - luarocks + - strace before_install: - - sudo add-apt-repository ppa:mwild1/ppa -y - - sudo apt-get update -y - - sudo apt-get install luajit -y --force-yes - - sudo apt-get install luarocks -y - - sudo apt-get install strace -y - git submodule update --init --recursive env: diff --git a/lib/ljsyscall/COPYRIGHT b/lib/ljsyscall/COPYRIGHT index 9f187c3513..2f9256587c 100644 --- a/lib/ljsyscall/COPYRIGHT +++ b/lib/ljsyscall/COPYRIGHT @@ -6,7 +6,7 @@ Files under the include directory include their own copyright information. ljsyscall: System call interface for LuaJIT -Copyright (C) 2011-2014 Justin Cormack. All rights reserved. +Copyright (C) 2011-2016 Justin Cormack. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/lib/ljsyscall/ChangeLog b/lib/ljsyscall/ChangeLog index 0442771dfe..c56e549bb8 100644 --- a/lib/ljsyscall/ChangeLog +++ b/lib/ljsyscall/ChangeLog @@ -1,10 +1,19 @@ - unreleased + 0.12 release + ++ Fix seccomp on arm64 ++ Linux added support for eBPF ++ bug fixes + + 0.11 release + + OSX time functions + OSX Mach types ++ OSX fixes for Yosemite + arm64 support -+ OpenBSD 5.6 and 5.7 support ++ OpenBSD 5.6, 5.7 and 5.8 support + ppc64le support, by Gustavo Serra Scalet + mipsel support ++ added Dockerfile, now available on Docker Hub 0.10 release diff --git a/lib/ljsyscall/Dockerfile b/lib/ljsyscall/Dockerfile new file mode 100644 index 0000000000..50bfcdfd9b --- /dev/null +++ b/lib/ljsyscall/Dockerfile @@ -0,0 +1,4 @@ +FROM alpine:3.4 +RUN apk update && apk add luajit luajit-dev strace && mkdir -p /usr/share/lua/5.1 +COPY . /usr/share/lua/5.1/ +ENTRYPOINT ["luajit"] diff --git a/lib/ljsyscall/README.md b/lib/ljsyscall/README.md index a96203a40d..bcb016820e 100644 --- a/lib/ljsyscall/README.md +++ b/lib/ljsyscall/README.md @@ -16,13 +16,15 @@ The [video of my FOSDEM 2013 talk](http://www.myriabit.com/ljsyscall/) here, and ## Install +A Docker hub automated build (currently only for Linux) is available via `docker pull justincormack/ljsyscall`. You can run the test suite with `docker run justincormack/ljsyscall test/test.lua`, use in a scripted way eg `docker run justincormack/ljsyscall -e "print(require('syscall').nl.interfaces())"` or get an interactive session with `docker -it run justincormack/ljsyscall`. + The stable release is now available in the luarocks repository, so you should be able to run ```luarocks install ljsyscall```. There will be a ```ljsyscall-rump``` rock soon, but I need to fix the install for the rump libraries. For simple uses, you just need to put the ```.lua``` files somewhere that LuaJIT will find them, eg typically in ```/usr/local/share/lua/5.1/```. Keep the directory structure there is. You can safely remove files from architectures and operating systems you do not use. You can also install the head version using luarocks: ```luarocks install rockspec/ljsyscall-scm-1.rockspec``` . -It is also available as a package in [buildroot](http://buildroot.uclibc.org/), a build system for embedded systems, and in [pkgsrc](http://www.pkgsrc.org] the portable packaging system for many systems. +It is also available as a package in [buildroot](http://buildroot.uclibc.org/), a build system for embedded systems, and in [pkgsrc](http://www.pkgsrc.org] the portable packaging system for many systems. It is now packaged for [Alpine Linux](http://www.alpinelinux.org/), in the testing repository. If you are using Lua rather than LuaJIT you need to install [luaffi](https://github.com/jmckaskill/luaffi) first; this is largely working now, but there will be more support for standard Lua coming soon. @@ -66,6 +68,7 @@ This project is being used in a variety of places, such as for testing the Linux * [buildroot](http://buildroot.uclibc.org/) has an ljsyscall package. * [luatz](https://github.com/daurnimator/luatz) uses ljsyscall when available * [Snabb switch](https://github.com/SnabbCo/snabbswitch) a high performance networking toolkit. +* [Spook](https://github.com/johnae/spook) started out as an fs events based test runner similar to Rubys guard but grew into an event toolkit of sorts. ## Testing diff --git a/lib/ljsyscall/debian/changelog b/lib/ljsyscall/debian/changelog new file mode 100644 index 0000000000..f44a5b608a --- /dev/null +++ b/lib/ljsyscall/debian/changelog @@ -0,0 +1,5 @@ +lua-ljsyscall (0.12-1) unstable; urgency=medium + + * UNRELEASED + + -- John Doe Sun, 23 Jul 2017 19:43:15 +0200 diff --git a/lib/ljsyscall/debian/compat b/lib/ljsyscall/debian/compat new file mode 100644 index 0000000000..ec635144f6 --- /dev/null +++ b/lib/ljsyscall/debian/compat @@ -0,0 +1 @@ +9 diff --git a/lib/ljsyscall/debian/control b/lib/ljsyscall/debian/control new file mode 100644 index 0000000000..154d215980 --- /dev/null +++ b/lib/ljsyscall/debian/control @@ -0,0 +1,25 @@ +Source: lua-ljsyscall +Section: interpreters +Priority: optional +Maintainer: nobody +Build-Depends: debhelper (>= 9), dh-lua +Standards-Version: 4.0.0 +Homepage: http://www.myriabit.com/ljsyscall/ + +Package: lua-ljsyscall +Architecture: all +Pre-Depends: ${misc:Pre-Depends} +Depends: luajit, ${misc:Depends} +Provides: ${lua:Provides} +XB-Lua-Versions: ${lua:Versions} +Description: LuaJIT Linux syscall FFI + +Package: lua-ljsyscall-dev +Section: libdevel +Architecture: all +Pre-Depends: ${misc:Pre-Depends} +Depends: ${misc:Depends} +Provides: ${lua:Provides} +XB-Lua-Versions: ${lua:Versions} +Description: ljsyscall doc + This package contains the documentation of the ljsyscall library. diff --git a/lib/ljsyscall/debian/copyright b/lib/ljsyscall/debian/copyright new file mode 100644 index 0000000000..679922b416 --- /dev/null +++ b/lib/ljsyscall/debian/copyright @@ -0,0 +1,30 @@ +Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: ljsyscall +Source: https://github.com/justincormack/ljsyscall + +Files: * +Copyright: Copyright (C) 2011-2016 Justin Cormack. All rights reserved. +License: Expat + +Files: */doc +Copyright: Copyright (C) 2011-2016 Justin Cormack. All rights reserved. +License: CC0 + +License: Expat + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + . + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + . + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. diff --git a/lib/ljsyscall/debian/dh-lua.conf b/lib/ljsyscall/debian/dh-lua.conf new file mode 100644 index 0000000000..19d2316714 --- /dev/null +++ b/lib/ljsyscall/debian/dh-lua.conf @@ -0,0 +1,24 @@ +### mandatory fields +LUA_VERSION=5.1 +PKG_NAME=ljsyscall + +### things relative to the C library part +CLIB_CFLAGS= +CLIB_LDFLAGS= +CLIB_LDFLAGS_STATIC= +CLIB_OBJS= +LUA_MODNAME_CPART= + +### things relative to the lua library part +LUA_HEADER= +LUA_SOURCES=syscall.lua syscall/*.lua syscall/shared/*.lua syscall/linux/*.lua syscall/linux/*/*.lua +LUA_SOURCES_MANGLER= +LUA_MODNAME=syscall +LUA_TEST= + +### this part is relative to pkg-config +PKG_VERSION= +PKG_LIBS_PRIVATE= +PKG_URL= +PKG_REQUIRES= +PKG_CONFLICTS= diff --git a/lib/ljsyscall/debian/lua-ljsyscall-dev.docs b/lib/ljsyscall/debian/lua-ljsyscall-dev.docs new file mode 100644 index 0000000000..ea60385cf8 --- /dev/null +++ b/lib/ljsyscall/debian/lua-ljsyscall-dev.docs @@ -0,0 +1,2 @@ +doc +test diff --git a/lib/ljsyscall/debian/patches/series b/lib/ljsyscall/debian/patches/series new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/ljsyscall/debian/rules b/lib/ljsyscall/debian/rules new file mode 100755 index 0000000000..4f36696ce5 --- /dev/null +++ b/lib/ljsyscall/debian/rules @@ -0,0 +1,4 @@ +#!/usr/bin/make -f + +%: + dh $@ --buildsystem=lua --with lua diff --git a/lib/ljsyscall/debian/source/format b/lib/ljsyscall/debian/source/format new file mode 100644 index 0000000000..163aaf8d82 --- /dev/null +++ b/lib/ljsyscall/debian/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/lib/ljsyscall/debian/tests/control b/lib/ljsyscall/debian/tests/control new file mode 100644 index 0000000000..cdb0fa9909 --- /dev/null +++ b/lib/ljsyscall/debian/tests/control @@ -0,0 +1,3 @@ +Tests: dh-lua-tests +Restrictions: rw-build-tree +Depends: @, dh-lua diff --git a/lib/ljsyscall/debian/tests/dh-lua-tests b/lib/ljsyscall/debian/tests/dh-lua-tests new file mode 100644 index 0000000000..738a2eb7ce --- /dev/null +++ b/lib/ljsyscall/debian/tests/dh-lua-tests @@ -0,0 +1 @@ +debian/rules autopkgtest diff --git a/lib/ljsyscall/debian/watch b/lib/ljsyscall/debian/watch new file mode 100644 index 0000000000..39da9e737d --- /dev/null +++ b/lib/ljsyscall/debian/watch @@ -0,0 +1,6 @@ +# test this watch file using: +# uscan --watchfile debian/watch --upstream-version 0.1 --package lua-ljsyscall +# +version=3 +opts=filenamemangle=s/.+\/v?(\d\S*)\.tar\.gz/ljsyscall-$1\.tar\.gz/ \ + https://github.com/justincormack/ljsyscall/releases .*/v?(\d\S*)\.tar\.gz diff --git a/lib/ljsyscall/docker-compose.test.yml b/lib/ljsyscall/docker-compose.test.yml new file mode 100644 index 0000000000..300cd4b981 --- /dev/null +++ b/lib/ljsyscall/docker-compose.test.yml @@ -0,0 +1,5 @@ +sut: + build: . + command: /test/test.lua + volumes: + - ./test:/test diff --git a/lib/ljsyscall/rockspec/ljsyscall-0.11-1.rockspec b/lib/ljsyscall/rockspec/ljsyscall-0.11-1.rockspec new file mode 100644 index 0000000000..2b66396d20 --- /dev/null +++ b/lib/ljsyscall/rockspec/ljsyscall-0.11-1.rockspec @@ -0,0 +1,170 @@ +package = "ljsyscall" +version = "0.11-1" +source = +{ + url = "https://github.com/justincormack/ljsyscall/archive/v0.11.tar.gz"; + dir = "ljsyscall-0.11"; +} + +description = +{ + summary = "LuaJIT Linux syscall FFI"; + homepage = "http://www.myriabit.com/ljsyscall/"; + license = "MIT"; +} +dependencies = +{ + "lua == 5.1"; -- In fact this should be "luajit >= 2.0.0" +} +build = +{ + type = "builtin"; + modules = + { + ["syscall"] = "syscall.lua"; + ["syscall.abi"] = "syscall/abi.lua"; + ["syscall.helpers"] = "syscall/helpers.lua"; + ["syscall.syscalls"] = "syscall/syscalls.lua"; + ["syscall.libc"] = "syscall/libc.lua"; + ["syscall.methods"] = "syscall/methods.lua"; + ["syscall.ffitypes"] = "syscall/ffitypes.lua"; + ["syscall.util"] = "syscall/util.lua"; + ["syscall.compat"] = "syscall/compat.lua"; + ["syscall.bit"] = "syscall/bit.lua"; + ["syscall.types"] = "syscall/types.lua"; + ["syscall.lfs"] = "syscall/lfs.lua"; + + ["syscall.shared.types"] = "syscall/shared/types.lua"; + }; + platforms = + { + linux = + { + modules = { + ["syscall.linux.syscalls"] = "syscall/linux/syscalls.lua"; + ["syscall.linux.c"] = "syscall/linux/c.lua"; + ["syscall.linux.constants"] = "syscall/linux/constants.lua"; + ["syscall.linux.ffi"] = "syscall/linux/ffi.lua"; + ["syscall.linux.ioctl"] = "syscall/linux/ioctl.lua"; + ["syscall.linux.types"] = "syscall/linux/types.lua"; + ["syscall.linux.fcntl"] = "syscall/linux/fcntl.lua"; + ["syscall.linux.errors"] = "syscall/linux/errors.lua"; + ["syscall.linux.util"] = "syscall/linux/util.lua"; + ["syscall.linux.nr"] = "syscall/linux/nr.lua"; + + ["syscall.linux.nl"] = "syscall/linux/nl.lua"; + ["syscall.linux.netfilter"] = "syscall/linux/netfilter.lua"; + ["syscall.linux.sockopt"] = "syscall/linux/sockopt.lua"; + ["syscall.linux.cgroup"] = "syscall/linux/cgroup.lua"; + + ["syscall.linux.arm.constants"] = "syscall/linux/arm/constants.lua"; + ["syscall.linux.arm.ffi"] = "syscall/linux/arm/ffi.lua"; + ["syscall.linux.arm.ioctl"] = "syscall/linux/arm/ioctl.lua"; + ["syscall.linux.arm.nr"] = "syscall/linux/arm/nr.lua"; + ["syscall.linux.arm64.constants"] = "syscall/linux/arm64/constants.lua"; + ["syscall.linux.arm64.ffi"] = "syscall/linux/arm64/ffi.lua"; + ["syscall.linux.arm64.ioctl"] = "syscall/linux/arm64/ioctl.lua"; + ["syscall.linux.arm64.nr"] = "syscall/linux/arm64/nr.lua"; + ["syscall.linux.mips.constants"] = "syscall/linux/mips/constants.lua"; + ["syscall.linux.mips.ffi"] = "syscall/linux/mips/ffi.lua"; + ["syscall.linux.mips.ioctl"] = "syscall/linux/mips/ioctl.lua"; + ["syscall.linux.mips.nr"] = "syscall/linux/mips/nr.lua"; + ["syscall.linux.ppc.constants"] = "syscall/linux/ppc/constants.lua"; + ["syscall.linux.ppc.ffi"] = "syscall/linux/ppc/ffi.lua"; + ["syscall.linux.ppc.ioctl"] = "syscall/linux/ppc/ioctl.lua"; + ["syscall.linux.ppc.nr"] = "syscall/linux/ppc/nr.lua"; + ["syscall.linux.ppc64le.constants"] = "syscall/linux/ppc64le/constants.lua"; + ["syscall.linux.ppc64le.ffi"] = "syscall/linux/ppc64le/ffi.lua"; + ["syscall.linux.ppc64le.ioctl"] = "syscall/linux/ppc64le/ioctl.lua"; + ["syscall.linux.ppc64le.nr"] = "syscall/linux/ppc64le/nr.lua"; + ["syscall.linux.x64.constants"] = "syscall/linux/x64/constants.lua"; + ["syscall.linux.x64.ffi"] = "syscall/linux/x64/ffi.lua"; + ["syscall.linux.x64.ioctl"] = "syscall/linux/x64/ioctl.lua"; + ["syscall.linux.x64.nr"] = "syscall/linux/x64/nr.lua"; + ["syscall.linux.x86.constants"] = "syscall/linux/x86/constants.lua"; + ["syscall.linux.x86.ffi"] = "syscall/linux/x86/ffi.lua"; + ["syscall.linux.x86.ioctl"] = "syscall/linux/x86/ioctl.lua"; + ["syscall.linux.x86.nr"] = "syscall/linux/x86/nr.lua"; + } + }; + macosx = + { + modules = + { + ["syscall.osx.syscalls"] = "syscall/osx/syscalls.lua"; + ["syscall.osx.c"] = "syscall/osx/c.lua"; + ["syscall.osx.constants"] = "syscall/osx/constants.lua"; + ["syscall.osx.ffi"] = "syscall/osx/ffi.lua"; + ["syscall.osx.ioctl"] = "syscall/osx/ioctl.lua"; + ["syscall.osx.types"] = "syscall/osx/types.lua"; + ["syscall.osx.fcntl"] = "syscall/osx/fcntl.lua"; + ["syscall.osx.errors"] = "syscall/osx/errors.lua"; + ["syscall.osx.util"] = "syscall/osx/util.lua"; + ["syscall.osx.sysctl"] = "syscall/osx/sysctl.lua"; + } + }; + freebsd = + { + modules = + { + ["syscall.freebsd.syscalls"] = "syscall/freebsd/syscalls.lua"; + ["syscall.freebsd.c"] = "syscall/freebsd/c.lua"; + ["syscall.freebsd.constants"] = "syscall/freebsd/constants.lua"; + ["syscall.freebsd.ffi"] = "syscall/freebsd/ffi.lua"; + ["syscall.freebsd.ioctl"] = "syscall/freebsd/ioctl.lua"; + ["syscall.freebsd.types"] = "syscall/freebsd/types.lua"; + ["syscall.freebsd.fcntl"] = "syscall/freebsd/fcntl.lua"; + ["syscall.freebsd.errors"] = "syscall/freebsd/errors.lua"; + ["syscall.freebsd.util"] = "syscall/freebsd/util.lua"; + ["syscall.freebsd.version"] = "syscall/freebsd/version.lua"; + ["syscall.freebsd.sysctl"] = "syscall/freebsd/sysctl.lua"; + } + }; + netbsd = + { + modules = + { + ["syscall.netbsd.syscalls"] = "syscall/netbsd/syscalls.lua"; + ["syscall.netbsd.c"] = "syscall/netbsd/c.lua"; + ["syscall.netbsd.constants"] = "syscall/netbsd/constants.lua"; + ["syscall.netbsd.ffitypes"] = "syscall/netbsd/ffitypes.lua"; + ["syscall.netbsd.ffifunctions"] = "syscall/netbsd/ffifunctions.lua"; + ["syscall.netbsd.ioctl"] = "syscall/netbsd/ioctl.lua"; + ["syscall.netbsd.types"] = "syscall/netbsd/types.lua"; + ["syscall.netbsd.fcntl"] = "syscall/netbsd/fcntl.lua"; + ["syscall.netbsd.errors"] = "syscall/netbsd/errors.lua"; + ["syscall.netbsd.util"] = "syscall/netbsd/util.lua"; + ["syscall.netbsd.nr"] = "syscall/netbsd/nr.lua"; + ["syscall.netbsd.init"] = "syscall/netbsd/init.lua"; + ["syscall.netbsd.version"] = "syscall/netbsd/version.lua"; + ["syscall.netbsd.sysctl"] = "syscall/netbsd/sysctl.lua"; + } + }; + openbsd = + { + modules = + { + ["syscall.openbsd.syscalls"] = "syscall/openbsd/syscalls.lua"; + ["syscall.openbsd.c"] = "syscall/openbsd/c.lua"; + ["syscall.openbsd.constants"] = "syscall/openbsd/constants.lua"; + ["syscall.openbsd.ffi"] = "syscall/openbsd/ffi.lua"; + ["syscall.openbsd.ioctl"] = "syscall/openbsd/ioctl.lua"; + ["syscall.openbsd.types"] = "syscall/openbsd/types.lua"; + ["syscall.openbsd.fcntl"] = "syscall/openbsd/fcntl.lua"; + ["syscall.openbsd.errors"] = "syscall/openbsd/errors.lua"; + ["syscall.openbsd.util"] = "syscall/openbsd/util.lua"; + ["syscall.openbsd.version"] = "syscall/openbsd/version.lua"; + ["syscall.openbsd.sysctl"] = "syscall/openbsd/sysctl.lua"; + } + }; + bsd = + { + modules = + { + ["syscall.bsd.syscalls"] = "syscall/bsd/syscalls.lua"; + ["syscall.bsd.ffi"] = "syscall/bsd/ffi.lua"; + ["syscall.bsd.types"] = "syscall/bsd/types.lua"; + } + }; + } +} diff --git a/lib/ljsyscall/rockspec/ljsyscall-0.12-1.rockspec b/lib/ljsyscall/rockspec/ljsyscall-0.12-1.rockspec new file mode 100644 index 0000000000..f614e71605 --- /dev/null +++ b/lib/ljsyscall/rockspec/ljsyscall-0.12-1.rockspec @@ -0,0 +1,170 @@ +package = "ljsyscall" +version = "0.12-1" +source = +{ + url = "https://github.com/justincormack/ljsyscall/archive/v0.12.tar.gz"; + dir = "ljsyscall-0.12"; +} + +description = +{ + summary = "LuaJIT Linux syscall FFI"; + homepage = "http://www.myriabit.com/ljsyscall/"; + license = "MIT"; +} +dependencies = +{ + "lua == 5.1"; -- In fact this should be "luajit >= 2.0.0" +} +build = +{ + type = "builtin"; + modules = + { + ["syscall"] = "syscall.lua"; + ["syscall.abi"] = "syscall/abi.lua"; + ["syscall.helpers"] = "syscall/helpers.lua"; + ["syscall.syscalls"] = "syscall/syscalls.lua"; + ["syscall.libc"] = "syscall/libc.lua"; + ["syscall.methods"] = "syscall/methods.lua"; + ["syscall.ffitypes"] = "syscall/ffitypes.lua"; + ["syscall.util"] = "syscall/util.lua"; + ["syscall.compat"] = "syscall/compat.lua"; + ["syscall.bit"] = "syscall/bit.lua"; + ["syscall.types"] = "syscall/types.lua"; + ["syscall.lfs"] = "syscall/lfs.lua"; + + ["syscall.shared.types"] = "syscall/shared/types.lua"; + }; + platforms = + { + linux = + { + modules = { + ["syscall.linux.syscalls"] = "syscall/linux/syscalls.lua"; + ["syscall.linux.c"] = "syscall/linux/c.lua"; + ["syscall.linux.constants"] = "syscall/linux/constants.lua"; + ["syscall.linux.ffi"] = "syscall/linux/ffi.lua"; + ["syscall.linux.ioctl"] = "syscall/linux/ioctl.lua"; + ["syscall.linux.types"] = "syscall/linux/types.lua"; + ["syscall.linux.fcntl"] = "syscall/linux/fcntl.lua"; + ["syscall.linux.errors"] = "syscall/linux/errors.lua"; + ["syscall.linux.util"] = "syscall/linux/util.lua"; + ["syscall.linux.nr"] = "syscall/linux/nr.lua"; + + ["syscall.linux.nl"] = "syscall/linux/nl.lua"; + ["syscall.linux.netfilter"] = "syscall/linux/netfilter.lua"; + ["syscall.linux.sockopt"] = "syscall/linux/sockopt.lua"; + ["syscall.linux.cgroup"] = "syscall/linux/cgroup.lua"; + + ["syscall.linux.arm.constants"] = "syscall/linux/arm/constants.lua"; + ["syscall.linux.arm.ffi"] = "syscall/linux/arm/ffi.lua"; + ["syscall.linux.arm.ioctl"] = "syscall/linux/arm/ioctl.lua"; + ["syscall.linux.arm.nr"] = "syscall/linux/arm/nr.lua"; + ["syscall.linux.arm64.constants"] = "syscall/linux/arm64/constants.lua"; + ["syscall.linux.arm64.ffi"] = "syscall/linux/arm64/ffi.lua"; + ["syscall.linux.arm64.ioctl"] = "syscall/linux/arm64/ioctl.lua"; + ["syscall.linux.arm64.nr"] = "syscall/linux/arm64/nr.lua"; + ["syscall.linux.mips.constants"] = "syscall/linux/mips/constants.lua"; + ["syscall.linux.mips.ffi"] = "syscall/linux/mips/ffi.lua"; + ["syscall.linux.mips.ioctl"] = "syscall/linux/mips/ioctl.lua"; + ["syscall.linux.mips.nr"] = "syscall/linux/mips/nr.lua"; + ["syscall.linux.ppc.constants"] = "syscall/linux/ppc/constants.lua"; + ["syscall.linux.ppc.ffi"] = "syscall/linux/ppc/ffi.lua"; + ["syscall.linux.ppc.ioctl"] = "syscall/linux/ppc/ioctl.lua"; + ["syscall.linux.ppc.nr"] = "syscall/linux/ppc/nr.lua"; + ["syscall.linux.ppc64le.constants"] = "syscall/linux/ppc64le/constants.lua"; + ["syscall.linux.ppc64le.ffi"] = "syscall/linux/ppc64le/ffi.lua"; + ["syscall.linux.ppc64le.ioctl"] = "syscall/linux/ppc64le/ioctl.lua"; + ["syscall.linux.ppc64le.nr"] = "syscall/linux/ppc64le/nr.lua"; + ["syscall.linux.x64.constants"] = "syscall/linux/x64/constants.lua"; + ["syscall.linux.x64.ffi"] = "syscall/linux/x64/ffi.lua"; + ["syscall.linux.x64.ioctl"] = "syscall/linux/x64/ioctl.lua"; + ["syscall.linux.x64.nr"] = "syscall/linux/x64/nr.lua"; + ["syscall.linux.x86.constants"] = "syscall/linux/x86/constants.lua"; + ["syscall.linux.x86.ffi"] = "syscall/linux/x86/ffi.lua"; + ["syscall.linux.x86.ioctl"] = "syscall/linux/x86/ioctl.lua"; + ["syscall.linux.x86.nr"] = "syscall/linux/x86/nr.lua"; + } + }; + macosx = + { + modules = + { + ["syscall.osx.syscalls"] = "syscall/osx/syscalls.lua"; + ["syscall.osx.c"] = "syscall/osx/c.lua"; + ["syscall.osx.constants"] = "syscall/osx/constants.lua"; + ["syscall.osx.ffi"] = "syscall/osx/ffi.lua"; + ["syscall.osx.ioctl"] = "syscall/osx/ioctl.lua"; + ["syscall.osx.types"] = "syscall/osx/types.lua"; + ["syscall.osx.fcntl"] = "syscall/osx/fcntl.lua"; + ["syscall.osx.errors"] = "syscall/osx/errors.lua"; + ["syscall.osx.util"] = "syscall/osx/util.lua"; + ["syscall.osx.sysctl"] = "syscall/osx/sysctl.lua"; + } + }; + freebsd = + { + modules = + { + ["syscall.freebsd.syscalls"] = "syscall/freebsd/syscalls.lua"; + ["syscall.freebsd.c"] = "syscall/freebsd/c.lua"; + ["syscall.freebsd.constants"] = "syscall/freebsd/constants.lua"; + ["syscall.freebsd.ffi"] = "syscall/freebsd/ffi.lua"; + ["syscall.freebsd.ioctl"] = "syscall/freebsd/ioctl.lua"; + ["syscall.freebsd.types"] = "syscall/freebsd/types.lua"; + ["syscall.freebsd.fcntl"] = "syscall/freebsd/fcntl.lua"; + ["syscall.freebsd.errors"] = "syscall/freebsd/errors.lua"; + ["syscall.freebsd.util"] = "syscall/freebsd/util.lua"; + ["syscall.freebsd.version"] = "syscall/freebsd/version.lua"; + ["syscall.freebsd.sysctl"] = "syscall/freebsd/sysctl.lua"; + } + }; + netbsd = + { + modules = + { + ["syscall.netbsd.syscalls"] = "syscall/netbsd/syscalls.lua"; + ["syscall.netbsd.c"] = "syscall/netbsd/c.lua"; + ["syscall.netbsd.constants"] = "syscall/netbsd/constants.lua"; + ["syscall.netbsd.ffitypes"] = "syscall/netbsd/ffitypes.lua"; + ["syscall.netbsd.ffifunctions"] = "syscall/netbsd/ffifunctions.lua"; + ["syscall.netbsd.ioctl"] = "syscall/netbsd/ioctl.lua"; + ["syscall.netbsd.types"] = "syscall/netbsd/types.lua"; + ["syscall.netbsd.fcntl"] = "syscall/netbsd/fcntl.lua"; + ["syscall.netbsd.errors"] = "syscall/netbsd/errors.lua"; + ["syscall.netbsd.util"] = "syscall/netbsd/util.lua"; + ["syscall.netbsd.nr"] = "syscall/netbsd/nr.lua"; + ["syscall.netbsd.init"] = "syscall/netbsd/init.lua"; + ["syscall.netbsd.version"] = "syscall/netbsd/version.lua"; + ["syscall.netbsd.sysctl"] = "syscall/netbsd/sysctl.lua"; + } + }; + openbsd = + { + modules = + { + ["syscall.openbsd.syscalls"] = "syscall/openbsd/syscalls.lua"; + ["syscall.openbsd.c"] = "syscall/openbsd/c.lua"; + ["syscall.openbsd.constants"] = "syscall/openbsd/constants.lua"; + ["syscall.openbsd.ffi"] = "syscall/openbsd/ffi.lua"; + ["syscall.openbsd.ioctl"] = "syscall/openbsd/ioctl.lua"; + ["syscall.openbsd.types"] = "syscall/openbsd/types.lua"; + ["syscall.openbsd.fcntl"] = "syscall/openbsd/fcntl.lua"; + ["syscall.openbsd.errors"] = "syscall/openbsd/errors.lua"; + ["syscall.openbsd.util"] = "syscall/openbsd/util.lua"; + ["syscall.openbsd.version"] = "syscall/openbsd/version.lua"; + ["syscall.openbsd.sysctl"] = "syscall/openbsd/sysctl.lua"; + } + }; + bsd = + { + modules = + { + ["syscall.bsd.syscalls"] = "syscall/bsd/syscalls.lua"; + ["syscall.bsd.ffi"] = "syscall/bsd/ffi.lua"; + ["syscall.bsd.types"] = "syscall/bsd/types.lua"; + } + }; + } +} diff --git a/lib/ljsyscall/syscall/bsd/ffi.lua b/lib/ljsyscall/syscall/bsd/ffi.lua index a09b59816f..a0fed50a4f 100644 --- a/lib/ljsyscall/syscall/bsd/ffi.lua +++ b/lib/ljsyscall/syscall/bsd/ffi.lua @@ -147,7 +147,6 @@ int mkfifoat(int dirfd, const char *pathname, mode_t mode); int fchmodat(int dirfd, const char *pathname, mode_t mode, int flags); int readlinkat(int dirfd, const char *pathname, char *buf, size_t bufsiz); int faccessat(int dirfd, const char *pathname, int mode, int flags); -int fstatat(int dirfd, const char *pathname, struct stat *buf, int flags); int futimens(int fd, const struct timespec times[2]); int utimensat(int dirfd, const char *pathname, const struct timespec times[2], int flags); diff --git a/lib/ljsyscall/syscall/freebsd/constants.lua b/lib/ljsyscall/syscall/freebsd/constants.lua index 305a8cb151..b1a703da03 100644 --- a/lib/ljsyscall/syscall/freebsd/constants.lua +++ b/lib/ljsyscall/syscall/freebsd/constants.lua @@ -1335,5 +1335,13 @@ c.CAP_RIGHTS_VERSION = 0 -- we do not understand others end -- freebsd >= 10 +if version >= 11 then +-- for utimensat +c.UTIME = strflag { + NOW = -1, + OMIT = -2, +} +end + return c diff --git a/lib/ljsyscall/syscall/freebsd/ffi.lua b/lib/ljsyscall/syscall/freebsd/ffi.lua index 872237e771..fecc9f509b 100644 --- a/lib/ljsyscall/syscall/freebsd/ffi.lua +++ b/lib/ljsyscall/syscall/freebsd/ffi.lua @@ -297,6 +297,7 @@ int cap_ioctls_limit(int fd, const unsigned long *cmds, size_t ncmds); ssize_t cap_ioctls_get(int fd, unsigned long *cmds, size_t maxcmds); int cap_fcntls_limit(int fd, uint32_t fcntlrights); int cap_fcntls_get(int fd, uint32_t *fcntlrightsp); +int fstatat(int dirfd, const char *pathname, struct stat *buf, int flags); int __sys_utimes(const char *filename, const struct timeval times[2]); int __sys_futimes(int, const struct timeval times[2]); diff --git a/lib/ljsyscall/syscall/linux/arm/nr.lua b/lib/ljsyscall/syscall/linux/arm/nr.lua index 058c8158d1..7fdb66d58a 100644 --- a/lib/ljsyscall/syscall/linux/arm/nr.lua +++ b/lib/ljsyscall/syscall/linux/arm/nr.lua @@ -338,6 +338,15 @@ local nr = { setns = 375, process_vm_readv = 376, process_vm_writev= 377, + kcmp = 378, + finit_module = 379, + sched_setattr = 380, + sched_getattr = 381, + renameat2 = 382, + seccomp = 383, + getrandom = 384, + memfd_create = 385, + bpf = 386, } } diff --git a/lib/ljsyscall/syscall/linux/c.lua b/lib/ljsyscall/syscall/linux/c.lua index 33c09e1425..e52a771a1a 100644 --- a/lib/ljsyscall/syscall/linux/c.lua +++ b/lib/ljsyscall/syscall/linux/c.lua @@ -7,12 +7,8 @@ Note a fair number are being deprecated, see include/uapi/asm-generic/unistd.h u Some of these we already don't use, but some we do, eg use open not openat etc. ]] -local require, error, assert, tonumber, tostring, -setmetatable, pairs, ipairs, unpack, rawget, rawset, -pcall, type, table, string, select = -require, error, assert, tonumber, tostring, -setmetatable, pairs, ipairs, unpack, rawget, rawset, -pcall, type, table, string, select +local require, tonumber, pcall, select = +require, tonumber, pcall, select local abi = require "syscall.abi" @@ -34,7 +30,6 @@ local uint, ulong = ffi.typeof("unsigned int"), ffi.typeof("unsigned long") local h = require "syscall.helpers" local err64 = h.err64 -local errpointer = h.errpointer local i6432, u6432 = bit.i6432, bit.u6432 @@ -53,7 +48,6 @@ else arg64u = function(val) return u6432(val) end end -- _llseek very odd, preadv -local function llarg64u(val) return u6432(val) end local function llarg64(val) return i6432(val) end local C = {} @@ -69,7 +63,6 @@ local u64 = ffi.typeof("uint64_t") -- TODO could make these return errno here, also are these best casts? local syscall_long = ffi.C.syscall -- returns long local function syscall(...) return tonumber(syscall_long(...)) end -- int is default as most common -local function syscall_uint(...) return uint(syscall_long(...)) end local function syscall_void(...) return void(syscall_long(...)) end local function syscall_off(...) return u64(syscall_long(...)) end -- off_t @@ -182,6 +175,7 @@ end -- glibc caches pid, but this fails to work eg after clone(). function C.getpid() return syscall(sys.getpid) end +function C.gettid() return syscall(sys.gettid) end -- underlying syscalls function C.exit_group(status) return syscall(sys.exit_group, int(status)) end -- void return really @@ -641,6 +635,7 @@ end function C.timer_gettime(timerid, curr_value) return syscall(sys.timer_gettime, int(timerid), void(curr_value)) end function C.timer_delete(timerid) return syscall(sys.timer_delete, int(timerid)) end function C.timer_getoverrun(timerid) return syscall(sys.timer_getoverrun, int(timerid)) end +function C.vhangup() return syscall(sys.vhangup) end -- only on some architectures if sys.waitpid then @@ -701,6 +696,18 @@ if sys.time then function C.time(t) return syscall(sys.time, void(t)) end end +-- bpf syscall that is only on Linux 3.19+ +if sys.bpf then + function C.bpf(cmd, attr) + return syscall(sys.bpf, int(cmd), void(attr), u64(ffi.sizeof('union bpf_attr'))) + end +end +if sys.perf_event_open then + function C.perf_event_open(attr, pid, cpu, group_fd, flags) + return syscall(sys.perf_event_open, void(attr), int(pid), int(cpu), int(group_fd), ulong(flags)) + end +end + -- socketcalls if not sys.socketcall then function C.socket(domain, tp, protocol) return syscall(sys.socket, int(domain), int(tp), int(protocol)) end diff --git a/lib/ljsyscall/syscall/linux/constants.lua b/lib/ljsyscall/syscall/linux/constants.lua index 3497885f92..ec0c32807b 100644 --- a/lib/ljsyscall/syscall/linux/constants.lua +++ b/lib/ljsyscall/syscall/linux/constants.lua @@ -161,9 +161,12 @@ c.F = strflag(arch.F or { SETLEASE = 1024, GETLEASE = 1025, NOTIFY = 1026, + CANCELLK = 1029, + DUPFD_CLOEXEC = 1030, SETPIPE_SZ = 1031, GETPIPE_SZ = 1032, - DUPFD_CLOEXEC = 1030, + ADD_SEALS = 1033, + GET_SEALS = 1034, }) -- messy @@ -208,6 +211,14 @@ c.LOCK = multiflags { RW = 192, } +-- for memfd +c.F_SEAL = multiflags { + SEAL = 0x0001, + SHRINK = 0x0002, + GROW = 0x0004, + WRITE = 0x0008, +} + --mmap c.PROT = multiflags { NONE = 0x0, @@ -300,6 +311,8 @@ c.SEEK = strflag { SET = 0, CUR = 1, END = 2, + DATA = 3, + HOLE = 4, } -- exit @@ -393,6 +406,12 @@ c.SOCK = multiflags(arch.SOCK or { c.SCM = strflag { RIGHTS = 0x01, CREDENTIALS = 0x02, + + TSTAMP_SND = 0, + TSTAMP_SCHED = 1, + TSTAMP_ACK = 2, + + TIMESTAMPING_OPT_STATS = 54, } -- setsockopt @@ -407,6 +426,7 @@ c.SOL = strflag { ATM = 264, AAL = 265, IRDA = 266, + XDP = 283 } if arch.SOLSOCKET then c.SOL.SOCKET = arch.SOLSOCKET else c.SOL.SOCKET = 1 end @@ -426,7 +446,7 @@ c.SO = strflag(arch.SO or { PRIORITY = 12, LINGER = 13, BSDCOMPAT = 14, ---REUSEPORT = 15, -- new, may not be defined yet + REUSEPORT = 15, -- new, may not be defined yet PASSCRED = 16, PEERCRED = 17, RCVLOWAT = 18, @@ -455,13 +475,57 @@ c.SO = strflag(arch.SO or { WIFI_STATUS = 41, PEEK_OFF = 42, NOFCS = 43, + LOCK_FILTER = 44, + SELECT_ERR_QUEUE = 45, + BUSY_POLL = 46, + MAX_PACING_RATE = 47, + BPF_EXTENSIONS = 48, + INCOMING_CPU = 49, + ATTACH_BPF = 50, + ATTACH_REUSEPORT_CBPF = 51, + ATTACH_REUSEPORT_EBPF = 52, + XDP_MMAP_OFFSETS = 1, + XDP_RX_RING = 2, + XDP_TX_RING = 3, + XDP_UMEM_REG = 4, + XDP_UMEM_FILL_RING = 5, + XDP_UMEM_COMPLETION_RING = 6, + XDP_STATISTICS = 7, + XDP_OPTIONS = 8 }) c.SO.GET_FILTER = c.SO.ATTACH_FILTER +c.SO.DETACH_BPF = c.SO.DETACH_FILTER + +c.SCM.TIMESTAMP = c.SO.TIMESTAMP +c.SCM.TIMESTAMPNS = c.SO.TIMESTAMPNS +c.SCM.TIMESTAMPING = c.SO.TIMESTAMPING -- Maximum queue length specifiable by listen. c.SOMAXCONN = 128 +c.SOF = strflag { + TIMESTAMPING_TX_HARDWARE = bit.lshift(1, 0), + TIMESTAMPING_TX_SOFTWARE = bit.lshift(1, 1), + TIMESTAMPING_RX_HARDWARE = bit.lshift(1, 2), + TIMESTAMPING_RX_SOFTWARE = bit.lshift(1, 3), + TIMESTAMPING_SOFTWARE = bit.lshift(1, 4), + TIMESTAMPING_SYS_HARDWARE = bit.lshift(1, 5), + TIMESTAMPING_RAW_HARDWARE = bit.lshift(1, 6), + TIMESTAMPING_OPT_ID = bit.lshift(1, 7), + TIMESTAMPING_TX_SCHED = bit.lshift(1, 8), + TIMESTAMPING_TX_ACK = bit.lshift(1, 9), + TIMESTAMPING_OPT_CMSG = bit.lshift(1, 10), + TIMESTAMPING_OPT_TSONLY = bit.lshift(1, 11), + TIMESTAMPING_OPT_STATS = bit.lshift(1, 12), + TIMESTAMPING_OPT_PKTINFO = bit.lshift(1, 13), + TIMESTAMPING_OPT_TX_SWHW = bit.lshift(1, 14), +} + +c.SOF.TIMESTAMPING_LAST = c.SOF.TIMESTAMPING_OPT_TX_SWHW +c.SOF.TIMESTAMPING_MASK = bit.bor(c.SOF.TIMESTAMPING_LAST - 1, + c.SOF.TIMESTAMPING_LAST) + -- shutdown c.SHUT = strflag { RD = 0, @@ -1162,6 +1226,10 @@ c.RTA = strflag { MP_ALGO = 14, TABLE = 15, MARK = 16, + MFC_STATS = 17, + VIA = 18, + NEWDST = 19, + PREF = 20, } -- route flags @@ -1374,6 +1442,7 @@ c.AF = strflag { CAIF = 37, ALG = 38, NFC = 39, + XDP = 44 } c.AF.UNIX = c.AF.LOCAL @@ -1954,6 +2023,7 @@ c.EM = strflag { MN10300 = 89, BLACKFIN = 106, TI_C6000 = 140, + AARCH64 = 183, FRV = 0x5441, AVR32 = 0x18ad, ALPHA = 0x9026, @@ -1970,6 +2040,7 @@ local __AUDIT_ARCH_64BIT = 0x80000000 local __AUDIT_ARCH_LE = 0x40000000 c.AUDIT_ARCH = strflag { + AARCH64 = c.EM.AARCH64 + __AUDIT_ARCH_64BIT + __AUDIT_ARCH_LE, ALPHA = c.EM.ALPHA + __AUDIT_ARCH_64BIT + __AUDIT_ARCH_LE, ARM = c.EM.ARM + __AUDIT_ARCH_LE, ARMEB = c.EM.ARM, @@ -2007,6 +2078,7 @@ c.BPF = multiflags { ST = 0x02, STX = 0x03, ALU = 0x04, + ALU64 = 0x07, JMP = 0x05, RET = 0x06, MISC = 0x07, @@ -2014,6 +2086,7 @@ c.BPF = multiflags { W = 0x00, H = 0x08, B = 0x10, + DW = 0x18, -- mode IMM = 0x00, ABS = 0x20, @@ -2030,12 +2103,23 @@ c.BPF = multiflags { AND = 0x50, LSH = 0x60, RSH = 0x70, + ARSH = 0xc0, NEG = 0x80, + MOD = 0x90, + XOR = 0xa0, + MOV = 0xb0, + XADD = 0xc0, + END = 0xd0, JA = 0x00, JEQ = 0x10, JGT = 0x20, JGE = 0x30, JSET = 0x40, + JNE = 0x50, + JSGT = 0x60, + JSGE = 0x70, + CALL = 0x80, + EXIT = 0x90, -- src K = 0x00, X = 0x08, @@ -2044,6 +2128,245 @@ c.BPF = multiflags { -- miscop TAX = 0x00, TXA = 0x80, + TO_LE = 0x00, + TO_BE = 0x08, +-- flags + ANY = 0, + NOEXIST = 1, + EXIST = 2, +} + +-- BPF map type +c.BPF_MAP = strflag { + UNSPEC = 0, + HASH = 1, + ARRAY = 2, + PROG_ARRAY = 3, + PERF_EVENT_ARRAY = 4, + PERCPU_HASH = 5, + PERCPU_ARRAY = 6, + STACK_TRACE = 7, + CGROUP_ARRAY = 8, + LRU_HASH = 9, + LRU_PERCPU_HASH = 10, + LPM_TRIE = 11, + ARRAY_OF_MAPS = 12, + HASH_OF_MAPS = 13, + DEVMAP = 14, + SOCKMAP = 15, + CPUMAP = 16, + XSKMAP = 17 +} + +-- BPF syscall commands +c.BPF_CMD = strflag { + MAP_CREATE = 0, + MAP_LOOKUP_ELEM = 1, + MAP_UPDATE_ELEM = 2, + MAP_DELETE_ELEM = 3, + MAP_GET_NEXT_KEY = 4, + PROG_LOAD = 5, + OBJ_PIN = 6, + OBJ_GET = 7, + PROG_ATTACH = 8, + PROG_DETACH = 9, + PROG_TEST_RUN = 10, + PROG_GET_NEXT_ID = 11, + MAP_GET_NEXT_ID = 12, + PROG_GET_FD_BY_ID = 13, + MAP_GET_FD_BY_ID = 14, + OBJ_GET_INFO_BY_FD = 15, + PROG_QUERY = 16, + RAW_TRACEPOINT_OPEN = 17, +} + +-- BPF program types +c.BPF_PROG = strflag { + UNSPEC = 0, + SOCKET_FILTER = 1, + KPROBE = 2, + SCHED_CLS = 3, + SCHED_ACT = 4, + TRACEPOINT = 5, + XDP = 6, + PERF_EVENT = 7, + CGROUP_SKB = 8, + CGROUP_SOCK = 9, + LWT_IN = 10, + LWT_OUT = 11, + LWT_XMIT = 12, + SOCK_OPS = 13, + SK_SKB = 14, + CGROUP_DEVICE = 15, + SK_MSG = 16, + RAW_TRACEPOINT = 17, + CGROUP_SOCK_ADDR = 18, +} + +-- BPF attach type +c.BPF_ATTACH_TYPE = strflag { + CGROUP_INET_INGRESS = 0, + CGROUP_INET_EGRESS = 1, + CGROUP_INET_SOCK_CREATE = 2, + CGROUP_SOCK_OPS = 3, + SK_SKB_STREAM_PARSER = 4, + SK_SKB_STREAM_VERDICT = 5, + CGROUP_DEVICE = 6, + SK_MSG_VERDICT = 7, + CGROUP_INET4_BIND = 8, + CGROUP_INET6_BIND = 9, + CGROUP_INET4_CONNECT = 10, + CGROUP_INET6_CONNECT = 11, + CGROUP_INET4_POST_BIND = 12, + CGROUP_INET6_POST_BIND = 13, +} + +-- Linux performance monitoring +-- perf_event_attr.type +c.PERF_TYPE = strflag { + HARDWARE = 0, + SOFTWARE = 1, + TRACEPOINT = 2, + HW_CACHE = 3, + RAW = 4, + BREAKPOINT = 5, +} + +-- perf_event_attr.event_id +c.PERF_COUNT = strflag { + -- Generalized performance event event_id types + HW_CPU_CYCLES = 0, + HW_INSTRUCTIONS = 1, + HW_CACHE_REFERENCES = 2, + HW_CACHE_MISSES = 3, + HW_BRANCH_INSTRUCTIONS = 4, + HW_BRANCH_MISSES = 5, + HW_BUS_CYCLES = 6, + HW_STALLED_CYCLES_FRONTEND = 7, + HW_STALLED_CYCLES_BACKEND = 8, + HW_REF_CPU_CYCLES = 9, + -- Generalized hardware cache events + HW_CACHE_L1D = 0, + HW_CACHE_L1I = 1, + HW_CACHE_LL = 2, + HW_CACHE_DTLB = 3, + HW_CACHE_ITLB = 4, + HW_CACHE_BPU = 5, + HW_CACHE_NODE = 6, + HW_CACHE_OP_READ = 0, + HW_CACHE_OP_WRITE = 1, + HW_CACHE_OP_PREFETCH = 2, + HW_CACHE_RESULT_ACCESS = 0, + HW_CACHE_RESULT_MISS = 1, + -- Special "software" events provided by the kernel + SW_CPU_CLOCK = 0, + SW_TASK_CLOCK = 1, + SW_PAGE_FAULTS = 2, + SW_CONTEXT_SWITCHES = 3, + SW_CPU_MIGRATIONS = 4, + SW_PAGE_FAULTS_MIN = 5, + SW_PAGE_FAULTS_MAJ = 6, + SW_ALIGNMENT_FAULTS = 7, + SW_EMULATION_FAULTS = 8, + SW_DUMMY = 9, + SW_BPF_OUTPUT = 10, +} + +-- Bits that can be set in perf_event_attr.sample_type to request information +c.PERF_SAMPLE = multiflags { + IP = bit.lshift(1, 0), + TID = bit.lshift(1, 1), + TIME = bit.lshift(1, 2), + ADDR = bit.lshift(1, 3), + READ = bit.lshift(1, 4), + CALLCHAIN = bit.lshift(1, 5), + ID = bit.lshift(1, 6), + CPU = bit.lshift(1, 7), + PERIOD = bit.lshift(1, 8), + STREAM_ID = bit.lshift(1, 9), + RAW = bit.lshift(1, 10), + BRANCH_STACK = bit.lshift(1, 11), + REGS_USER = bit.lshift(1, 12), + STACK_USER = bit.lshift(1, 13), + WEIGHT = bit.lshift(1, 14), + DATA_SRC = bit.lshift(1, 15), + IDENTIFIER = bit.lshift(1, 16), + TRANSACTION = bit.lshift(1, 17), + REGS_INTR = bit.lshift(1, 18), +} + +-- values to program into perf_event_attr.branch_sample_type when PERF_SAMPLE_BRANCH is set +c.PERF_SAMPLE_BRANCH = multiflags { + USER_SHIFT = 0, + KERNEL_SHIFT = 1, + HV_SHIFT = 2, + ANY_SHIFT = 3, + ANY_CALL_SHIFT = 4, + ANY_RETURN_SHIFT = 5, + IND_CALL_SHIFT = 6, + ABORT_TX_SHIFT = 7, + IN_TX_SHIFT = 8, + NO_TX_SHIFT = 9, + COND_SHIFT = 10, + CALL_STACK_SHIFT = 11, + IND_JUMP_SHIFT = 12, + CALL_SHIFT = 13, + NO_FLAGS_SHIFT = 14, + NO_CYCLES_SHIFT = 15, +} +c.PERF_SAMPLE_BRANCH.USER = bit.lshift(1, c.PERF_SAMPLE_BRANCH.USER_SHIFT) +c.PERF_SAMPLE_BRANCH.KERNEL = bit.lshift(1, c.PERF_SAMPLE_BRANCH.KERNEL_SHIFT) +c.PERF_SAMPLE_BRANCH.HV = bit.lshift(1, c.PERF_SAMPLE_BRANCH.HV_SHIFT) +c.PERF_SAMPLE_BRANCH.ANY = bit.lshift(1, c.PERF_SAMPLE_BRANCH.ANY_SHIFT) +c.PERF_SAMPLE_BRANCH.ANY_CALL = bit.lshift(1, c.PERF_SAMPLE_BRANCH.ANY_CALL_SHIFT) +c.PERF_SAMPLE_BRANCH.ANY_RETURN = bit.lshift(1, c.PERF_SAMPLE_BRANCH.ANY_RETURN_SHIFT) +c.PERF_SAMPLE_BRANCH.IND_CALL = bit.lshift(1, c.PERF_SAMPLE_BRANCH.IND_CALL_SHIFT) +c.PERF_SAMPLE_BRANCH.ABORT_TX = bit.lshift(1, c.PERF_SAMPLE_BRANCH.ABORT_TX_SHIFT) +c.PERF_SAMPLE_BRANCH.IN_TX = bit.lshift(1, c.PERF_SAMPLE_BRANCH.IN_TX_SHIFT) +c.PERF_SAMPLE_BRANCH.NO_TX = bit.lshift(1, c.PERF_SAMPLE_BRANCH.NO_TX_SHIFT) +c.PERF_SAMPLE_BRANCH.COND = bit.lshift(1, c.PERF_SAMPLE_BRANCH.COND_SHIFT) +c.PERF_SAMPLE_BRANCH.CALL_STACK = bit.lshift(1, c.PERF_SAMPLE_BRANCH.CALL_STACK_SHIFT) +c.PERF_SAMPLE_BRANCH.IND_JUMP = bit.lshift(1, c.PERF_SAMPLE_BRANCH.IND_JUMP_SHIFT) +c.PERF_SAMPLE_BRANCH.CALL = bit.lshift(1, c.PERF_SAMPLE_BRANCH.CALL_SHIFT) +c.PERF_SAMPLE_BRANCH.NO_FLAGS = bit.lshift(1, c.PERF_SAMPLE_BRANCH.NO_FLAGS_SHIFT) +c.PERF_SAMPLE_BRANCH.NO_CYCLES = bit.lshift(1, c.PERF_SAMPLE_BRANCH.NO_CYCLES_SHIFT) + +-- Flags for perf_attr.read_format +c.PERF_READ_FORMAT = multiflags { + TOTAL_TIME_ENABLED = bit.lshift(1, 0), + TOTAL_TIME_RUNNING = bit.lshift(1, 1), + ID = bit.lshift(1, 2), + GROUP = bit.lshift(1, 3), +} + +-- Flags for perf_event_open +c.PERF_FLAG = multiflags { + FD_NO_GROUP = bit.lshift(1, 0), + FD_OUTPUT = bit.lshift(1, 1), + PID_CGROUP = bit.lshift(1, 2), + FD_CLOEXEC = bit.lshift(1, 3), +} + + +-- If perf_event_attr.sample_id_all is set then all event types will +-- have the sample_type selected fields related to where/when +-- (identity) an event took place (TID, TIME, ID, STREAM_ID, CPU, IDENTIFIER) +c.PERF_RECORD = strflag { + MMAP = 1, + LOST = 2, + COMM = 3, + EXIT = 4, + THROTTLE = 5, + UNTHROTTLE = 6, + FORK = 7, + READ = 8, + SAMPLE = 9, + MMAP2 = 10, + AUX = 11, + ITRACE_START = 12, + LOST_SAMPLES = 13, + SWITCH = 14, + SWITCH_CPU_WIDE= 15, } -- termios - c_cc characters diff --git a/lib/ljsyscall/syscall/linux/fcntl.lua b/lib/ljsyscall/syscall/linux/fcntl.lua index dd6621ffc6..67567c25be 100644 --- a/lib/ljsyscall/syscall/linux/fcntl.lua +++ b/lib/ljsyscall/syscall/linux/fcntl.lua @@ -22,6 +22,7 @@ local fcntl = { [c.F.GETLK] = t.flock, [c.F.SETLK] = t.flock, [c.F.SETLKW] = t.flock, + [c.F.ADD_SEALS] = function(arg) return c.F_SEAL[arg] end, }, ret = { [c.F.DUPFD] = function(ret) return t.fd(ret) end, @@ -33,6 +34,7 @@ local fcntl = { [c.F.GETSIG] = function(ret) return tonumber(ret) end, [c.F.GETPIPE_SZ] = function(ret) return tonumber(ret) end, [c.F.GETLK] = function(ret, arg) return arg end, + [c.F.GET_SEALS] = function(ret) return tonumber(ret) end, } } diff --git a/lib/ljsyscall/syscall/linux/ffi.lua b/lib/ljsyscall/syscall/linux/ffi.lua index 084fa7cdcf..2df6267494 100644 --- a/lib/ljsyscall/syscall/linux/ffi.lua +++ b/lib/ljsyscall/syscall/linux/ffi.lua @@ -498,10 +498,144 @@ struct sock_filter { uint8_t jf; uint32_t k; }; +struct bpf_insn { + uint8_t code; /* opcode */ + uint8_t dst_reg:4; /* dest register */ + uint8_t src_reg:4; /* source register */ + uint16_t off; /* signed offset */ + uint32_t imm; /* signed immediate constant */ +}; struct sock_fprog { unsigned short len; struct sock_filter *filter; }; +union bpf_attr { + struct { + uint32_t map_type; + uint32_t key_size; + uint32_t value_size; + uint32_t max_entries; + }; + struct { + uint32_t map_fd; + uint64_t key __attribute__((aligned(8))); + union { + uint64_t value __attribute__((aligned(8))); + uint64_t next_key __attribute__((aligned(8))); + }; + uint64_t flags; + }; + struct { + uint32_t prog_type; + uint32_t insn_cnt; + uint64_t insns __attribute__((aligned(8))); + uint64_t license __attribute__((aligned(8))); + uint32_t log_level; + uint32_t log_size; + uint64_t log_buf __attribute__((aligned(8))); + uint32_t kern_version; + }; + struct { + uint64_t pathname __attribute__((aligned(8))); + uint32_t bpf_fd; + uint32_t file_flags; + }; +} __attribute__((aligned(8))); +struct perf_event_attr { + uint32_t pe_type; + uint32_t size; + uint64_t pe_config; + union { + uint64_t sample_period; + uint64_t sample_freq; + }; + uint64_t pe_sample_type; + uint64_t read_format; + uint32_t disabled:1, + inherit:1, + pinned:1, + exclusive:1, + exclude_user:1, + exclude_kernel:1, + exclude_hv:1, + exclude_idle:1, + mmap:1, + comm:1, + freq:1, + inherit_stat:1, + enable_on_exec:1, + task:1, + watermark:1, + precise_ip:2, + mmap_data:1, + sample_id_all:1, + exclude_host:1, + exclude_guest:1, + exclude_callchain_kernel:1, + exclude_callchain_user:1, + mmap2:1, + comm_exec:1, + use_clockid:1, + __reserved_1a:6; + uint32_t __reserved_1b; + union { + uint32_t wakeup_events; + uint32_t wakeup_watermark; + }; + uint32_t bp_type; + union { + uint64_t bp_addr; + uint64_t config1; + }; + union { + uint64_t bp_len; + uint64_t config2; + }; + uint64_t branch_sample_type; + uint64_t sample_regs_user; + uint32_t sample_stack_user; + int32_t clockid; + uint64_t sample_regs_intr; + uint32_t aux_watermark; + uint32_t __reserved_2; +}; +struct perf_event_mmap_page { + uint32_t version; + uint32_t compat_version; + uint32_t lock; + uint32_t index; + int64_t offset; + uint64_t time_enabled; + uint64_t time_running; + union { + uint64_t capabilities; + struct { + uint32_t cap_bit0 : 1, + cap_bit0_is_deprecated : 1, + cap_user_rdpmc : 1, + cap_user_time : 1, + cap_user_time_zero : 1; + }; + }; + uint16_t pmc_width; + uint16_t time_shift; + uint32_t time_mult; + uint64_t time_offset; + uint64_t __reserved[120]; + volatile uint64_t data_head; + volatile uint64_t data_tail; + volatile uint64_t data_offset; + volatile uint64_t data_size; + uint64_t aux_head; + uint64_t aux_tail; + uint64_t aux_offset; + uint64_t aux_size; +}; +struct perf_event_header { + uint32_t type; + uint16_t misc; + uint16_t size; +}; struct mq_attr { long mq_flags, mq_maxmsg, mq_msgsize, mq_curmsgs, __unused[4]; }; @@ -703,6 +837,9 @@ struct rusage { long ru_nvcsw; long ru_nivcsw; }; +struct scm_timestamping { + struct timespec ts[3]; +}; ]] append(arch.nsig or [[ diff --git a/lib/ljsyscall/syscall/linux/ioctl.lua b/lib/ljsyscall/syscall/linux/ioctl.lua index 7ab871c92b..c695dca2c3 100644 --- a/lib/ljsyscall/syscall/linux/ioctl.lua +++ b/lib/ljsyscall/syscall/linux/ioctl.lua @@ -193,6 +193,7 @@ local ioctl = strflag { SIOCSIFFLAGS = 0x8914, SIOCGIFMTU = 0x8921, SIOCSIFMTU = 0x8922, + SIOCSIFHWADDR = 0x8924, SIOCGIFHWADDR = 0x8927, SIOCGIFINDEX = 0x8933, @@ -271,6 +272,16 @@ local ioctl = strflag { -- from linux/vfio.h type is ';' base is 100 VFIO_GET_API_VERSION = vfio('NONE', 0), VFIO_CHECK_EXTENSION = vfio('WRITE', 1, "uint32"), +-- from linux/perf_event.h + PERF_EVENT_IOC_ENABLE = _IO('$', 0), + PERF_EVENT_IOC_DISABLE = _IO('$', 1), + PERF_EVENT_IOC_REFRESH = _IO('$', 2), + PERF_EVENT_IOC_RESET = _IO('$', 3), + PERF_EVENT_IOC_PERIOD = _IOW('$', 4, "uint64"), + PERF_EVENT_IOC_SET_OUTPUT= _IO('$', 5), + PERF_EVENT_IOC_SET_FILTER= _IOW('$', 6, "uintptr"), + PERF_EVENT_IOC_ID = _IOR('$', 7, "uint64_1"), + PERF_EVENT_IOC_SET_BPF = _IOW('$', 8, "uint32"), -- allow user defined ioctls _IO = _IO, diff --git a/lib/ljsyscall/syscall/linux/nl.lua b/lib/ljsyscall/syscall/linux/nl.lua index fad5625ed8..a7da48a831 100644 --- a/lib/ljsyscall/syscall/linux/nl.lua +++ b/lib/ljsyscall/syscall/linux/nl.lua @@ -173,6 +173,10 @@ local rta_decode = { ir.cacheinfo = t.rta_cacheinfo() ffi.copy(ir.cacheinfo, buf, s.rta_cacheinfo) end, + [c.RTA.PREF] = function(ir, buf, len) + local i = pt.uint8(buf) + ir.pref = tonumber(i[0]) + end, -- TODO some missing } @@ -340,6 +344,44 @@ mt.iflink = { end } +meth.ndmsg = { + index = { + family = function(i) return tonumber(i.ndmsg.ndm_family) end, + ifindex = function(i) return tonumber(i.ndmsg.ndm_ifindex) end, + state = function(i) return tonumber(i.ndmsg.ndm_state) end, + flags = function(i) return tonumber(i.ndmsg.ndm_flags) end, + type = function(i) return tonumber(i.ndmsg.ndm_type) end, + dest = function(i) return i.dst or addrtype(i.family) end, + -- might not be set in Lua table, so return nil + dst = function() return nil end, + lladdr = function() return nil end, + }, + flags = { + [c.NTF.PROXY] = "proxy", + [c.NTF.ROUTER] = "router", + }, + state = { + [c.NUD.INCOMPLETE] = "incomplete", + [c.NUD.REACHABLE] = "reachable", + [c.NUD.STALE] = "stale", + [c.NUD.DELAY] = "delay", + [c.NUD.PROBE] = "probe", + [c.NUD.FAILED] = "failed", + [c.NUD.NOARP] = "noarp", + [c.NUD.PERMANENT] = "permanent", + } +} + +mt.ndmsg = { + __index = function(i, k) + if meth.ndmsg.index[k] then return meth.ndmsg.index[k](i) end + end, + __tostring = function(i) -- TODO make more like output of ip route + local s = "dst: " .. tostring(i.dest) .. " lladdr: " .. tostring(i.lladdr) .. " if: " .. i.ifindex + return s + end, +} + meth.rtmsg = { index = { family = function(i) return tonumber(i.rtmsg.rtm_family) end, @@ -378,6 +420,17 @@ mt.rtmsg = { end, } + +mt.neighs = { + __tostring = function(is) + local s = {} + for k, v in ipairs(is) do + s[#s + 1] = tostring(v) + end + return table.concat(s, '\n') + end, +} + meth.routes = { fn = { match = function(rs, addr, len) -- exact match @@ -496,12 +549,12 @@ local function decode_route(buf, len) end local function decode_neigh(buf, len) - local rt = pt.rtmsg(buf) - buf = buf + nlmsg_align(s.rtmsg) - len = len - nlmsg_align(s.rtmsg) + local rt = pt.ndmsg(buf) + buf = buf + nlmsg_align(s.ndmsg) + len = len - nlmsg_align(s.ndmsg) local rtattr = pt.rtattr(buf) - local ir = setmetatable({rtmsg = t.rtmsg()}, mt.rtmsg) - ffi.copy(ir.rtmsg, rt, s.rtmsg) + local ir = setmetatable({ndmsg = t.ndmsg()}, mt.ndmsg) + ffi.copy(ir.ndmsg, rt, s.ndmsg) while rta_ok(rtattr, len) do if nda_decode[rtattr.rta_type] then nda_decode[rtattr.rta_type](ir, buf + rta_length(0), rta_align(rtattr.rta_len) - rta_length(0)) @@ -1051,7 +1104,9 @@ function nl.getneigh(index, tab, ...) if type(index) == 'table' then index = index.index end tab.ifindex = index local ndm = t.ndmsg(tab) - return nlmsg("getneigh", "request, dump", ndm.family, t.ndmsg, ndm, ...) + local n, err = nlmsg("getneigh", "request, dump", ndm.family, t.ndmsg, ndm, ...) + if not n then return nil, err end + return setmetatable(n, mt.neighs) end function nl.newneigh(index, tab, ...) diff --git a/lib/ljsyscall/syscall/linux/ppc/nr.lua b/lib/ljsyscall/syscall/linux/ppc/nr.lua index 010fdb4c24..1d712ebf03 100644 --- a/lib/ljsyscall/syscall/linux/ppc/nr.lua +++ b/lib/ljsyscall/syscall/linux/ppc/nr.lua @@ -354,6 +354,15 @@ local nr = { setns = 350, process_vm_readv = 351, process_vm_writev = 352, + kcmp = 353, + finit_module = 354, + sched_setattr = 355, + sched_getattr = 356, + renameat2 = 357, + seccomp = 358, + getrandom = 359, + memfd_create = 360, + bpf = 361, } } diff --git a/lib/ljsyscall/syscall/linux/ppc64le/nr.lua b/lib/ljsyscall/syscall/linux/ppc64le/nr.lua index bd0df08fc1..0aa6ca6a01 100644 --- a/lib/ljsyscall/syscall/linux/ppc64le/nr.lua +++ b/lib/ljsyscall/syscall/linux/ppc64le/nr.lua @@ -349,6 +349,19 @@ local nr = { kcmp = 354, sched_setattr = 355, sched_getattr = 356, + renameat2 = 357, + seccomp = 358, + getrandom = 359, + memfd_create = 360, + bpf = 361, + execveat = 362, + switch_endian = 363, + userfaultfd = 364, + membarrier = 365, + mlock2 = 378, + copy_file_range = 379, + preadv2 = 380, + pwritev2 = 381, } } diff --git a/lib/ljsyscall/syscall/linux/syscalls.lua b/lib/ljsyscall/syscall/linux/syscalls.lua index 843e9e713e..8766481376 100644 --- a/lib/ljsyscall/syscall/linux/syscalls.lua +++ b/lib/ljsyscall/syscall/linux/syscalls.lua @@ -828,6 +828,172 @@ function S.sysctl(name, new) return old end +-- BPF syscall has a complex semantics with one union serving for all purposes +-- The interface exports both raw syscall and helper functions based on libbpf +if C.bpf then + local function ptr_to_u64(p) return ffi.cast('uint64_t', ffi.cast('void *', p)) end + function S.bpf(cmd, attr) + return C.bpf(c.BPF_CMD[cmd], attr) + end + function S.bpf_prog_load(type, insns, len, license, version, log_level) + if not license then license = "GPL" end -- Must stay alive during the syscall + local bpf_log_buf = ffi.new('char [?]', 64*1024) -- Must stay alive during the syscall + if not version then + -- We have no better way to extract current kernel hex-string other + -- than parsing headers, compiling a helper function or reading /proc + local ver_str, count = S.sysctl('kernel.osrelease'):match('%d+.%d+.%d+'), 2 + version = 0 + for i in ver_str:gmatch('%d+') do -- Convert 'X.Y.Z' to 0xXXYYZZ + version = bit.bor(version, bit.lshift(tonumber(i), 8*count)) + count = count - 1 + end + end + local attr = t.bpf_attr1() + attr[0].prog_type = c.BPF_PROG[type] + attr[0].insns = ptr_to_u64(insns) + attr[0].insn_cnt = len + attr[0].license = ptr_to_u64(license) + attr[0].log_buf = ptr_to_u64(bpf_log_buf) + attr[0].log_size = ffi.sizeof(bpf_log_buf) + attr[0].log_level = log_level or 1 + attr[0].kern_version = version -- MUST match current kernel version + local fd = S.bpf(c.BPF_CMD.PROG_LOAD, attr) + if fd < 0 then + return nil, t.error(errno()), ffi.string(bpf_log_buf) + end + return retfd(fd), ffi.string(bpf_log_buf) + end + function S.bpf_map_create(type, key_size, value_size, max_entries) + local attr = t.bpf_attr1() + attr[0].map_type = c.BPF_MAP[type] + attr[0].key_size = key_size + attr[0].value_size = value_size + attr[0].max_entries = max_entries + local fd = S.bpf(c.BPF_CMD.MAP_CREATE, attr) + if fd < 0 then + return nil, t.error(errno()) + end + return retfd(fd) + end + function S.bpf_map_op(op, fd, key, val_or_next, flags) + local attr = t.bpf_attr1() + attr[0].map_fd = getfd(fd) + attr[0].key = ptr_to_u64(key) + attr[0].value = ptr_to_u64(val_or_next) + attr[0].flags = flags or 0 + local ret = S.bpf(op, attr) + if ret ~= 0 then + return nil, t.error(errno()) + end + return ret + end + function S.bpf_obj_pin(path, fd, flags) + local attr = t.bpf_attr1() + local pathname = ffi.new("char[?]", #path+1) + ffi.copy(pathname, path) + attr[0].pathname = ptr_to_u64(pathname) + attr[0].bpf_fd = getfd(fd) + attr[0].file_flags = flags or 0 + local ret = S.bpf(c.BPF_CMD.OBJ_PIN, attr) + if ret ~= 0 then + return nil, t.error(errno()) + end + return ret + end + function S.bpf_obj_get(path, flags) + local attr = t.bpf_attr1() + local pathname = ffi.new("char[?]", #path+1) + ffi.copy(pathname, path) + attr[0].pathname = ptr_to_u64(pathname) + attr[0].file_flags = flags or 0 + local ret = S.bpf(c.BPF_CMD.OBJ_GET, attr) + if ret < 0 then + return nil, t.error(errno()) + end + return retfd(ret) + end +end + +-- Linux performance monitoring +if C.perf_event_open then + -- Open perf event fd + -- @note see man 2 perf_event_open + -- @return fd, err + function S.perf_event_open(attr, pid, cpu, group_fd, flags) + if attr[0].size == 0 then attr[0].size = ffi.sizeof(attr[0]) end + local fd = C.perf_event_open(attr, pid or 0, cpu or -1, group_fd or -1, c.PERF_FLAG[flags or 0]) + if fd < 0 then + return nil, t.error(errno()) + end + return retfd(fd) + end + -- Read the tracepoint configuration (see "/sys/kernel/debug/tracing/available_events") + -- @param event_path path to tracepoint (e.g. "/sys/kernel/debug/tracing/events/syscalls/sys_enter_write") + -- @return tp, err (e.g. 538, nil) + function S.perf_tracepoint(event_path) + local config = nil + event_path = event_path.."/id" + local fd, err = S.open(event_path, c.O.RDONLY) + if fd then + local ret, err = fd:read(nil, 256) + if ret then + config = tonumber(ret) + end + fd:close() + end + return config, err + end + -- Attach or detach a probe, same semantics as Lua tables. + -- See https://www.kernel.org/doc/Documentation/trace/kprobetrace.txt + -- (When the definition is not nil, it will be created, otherwise it will be detached) + -- @param probe_type either "kprobe" or "uprobe", no other probe types are supported + -- @param name chosen probe name (e.g. "myprobe") + -- @param definition (set to nil to disable probe) (e.g. "do_sys_open $retval") + -- @param retval true/false if this should be entrypoint probe or return probe + -- @return tp, err (e.g. 1099, nil) + function S.perf_probe(probe_type, name, definition, retval) + local event_path = string.format('/sys/kernel/debug/tracing/%s_events', probe_type) + local probe_path = string.format('/sys/kernel/debug/tracing/events/%ss/%s', probe_type, name) + -- Check if probe already exists + if definition and S.statfs(probe_path) then return nil, t.error(c.E.EEXIST) end + local fd, err = S.open(event_path, "wronly, append") + if not fd then return nil, err end + -- Format a probe definition + if not definition then + definition = "-:"..name -- Detach + else + definition = string.format("%s:%s %s", retval and "r" or "p", name, definition) + end + local ok, err = fd:write(definition) + fd:close() + -- Return tracepoint or success + if ok and definition then + return S.perf_tracepoint(probe_path) + end + return ok, err + end + -- Attach perf event reader to tracepoint (see "/sys/kernel/debug/tracing/available_events") + -- @param tp tracepoint identifier (e.g.: 538, use `S.perf_tracepoint()`) + -- @param type perf_attr.sample_type (default: "raw") + -- @param attrs table of attributes (e.g. {sample_type="raw, callchain"}, see `struct perf_event_attr`) + -- @return reader, err + function S.perf_attach_tracepoint(tp, pid, cpu, group_fd, attrs) + local pe = t.perf_event_attr1() + pe[0].type = "tracepoint" + pe[0].config = tp + pe[0].sample_type = "raw" + pe[0].sample_period = 1 + pe[0].wakeup_events = 1 + if attrs then + for k,v in pairs(attrs) do pe[0][k] = v end + end + -- Open perf event reader with given parameters + local fd, err = S.perf_event_open(pe, pid, cpu, group_fd, "fd_cloexec") + if not fd then return nil, err end + return t.perf_reader(fd) + end +end + return S end diff --git a/lib/ljsyscall/syscall/linux/types.lua b/lib/ljsyscall/syscall/linux/types.lua index 2af4ed5112..1ccd6b4356 100644 --- a/lib/ljsyscall/syscall/linux/types.lua +++ b/lib/ljsyscall/syscall/linux/types.lua @@ -115,6 +115,7 @@ local addstructs = { ff_rumble_effect = "struct ff_rumble_effect", ff_effect = "struct ff_effect", sock_fprog = "struct sock_fprog", + bpf_attr = "union bpf_attr", user_cap_header = "struct user_cap_header", user_cap_data = "struct user_cap_data", xt_get_revision = "struct xt_get_revision", @@ -128,6 +129,7 @@ local addstructs = { vhost_vring_addr = "struct vhost_vring_addr", vhost_memory_region = "struct vhost_memory_region", vhost_memory = "struct vhost_memory", + scm_timestamping = "struct scm_timestamping", } for k, v in pairs(addtypes) do addtype(types, k, v) end @@ -136,9 +138,12 @@ for k, v in pairs(addstructs) do addtype(types, k, v, lenmt) end -- these ones not in table as not helpful with vararg or arrays TODO add more addtype variants t.inotify_event = ffi.typeof("struct inotify_event") pt.inotify_event = ptt("struct inotify_event") -- still need pointer to this +pt.perf_event_header = ptt("struct perf_event_header") t.aio_context1 = ffi.typeof("aio_context_t[1]") t.sock_fprog1 = ffi.typeof("struct sock_fprog[1]") +t.bpf_attr1 = ffi.typeof("union bpf_attr[1]") +t.perf_event_attr1 = ffi.typeof("struct perf_event_attr[1]") t.user_cap_data2 = ffi.typeof("struct user_cap_data[2]") @@ -147,6 +152,8 @@ local iocbs = ffi.typeof("struct iocb[?]") t.iocbs = function(n, ...) return ffi.new(iocbs, n, ...) end local sock_filters = ffi.typeof("struct sock_filter[?]") t.sock_filters = function(n, ...) return ffi.new(sock_filters, n, ...) end +local bpf_insns = ffi.typeof("struct bpf_insn[?]") +t.bpf_insns = function(n, ...) return ffi.new(bpf_insns, n, ...) end local iocb_ptrs = ffi.typeof("struct iocb *[?]") t.iocb_ptrs = function(n, ...) return ffi.new(iocb_ptrs, n, ...) end @@ -760,6 +767,14 @@ mt.sock_filter = { addtype(types, "sock_filter", "struct sock_filter", mt.sock_filter) +mt.bpf_insn = { + __new = function(tp, code, dst_reg, src_reg, off, imm) + return ffi.new(tp, c.BPF[code], dst_reg or 0, src_reg or 0, off or 0, imm or 0) + end +} + +addtype(types, "bpf_insn", "struct bpf_insn", mt.bpf_insn) + -- capabilities data is an array so cannot put metatable on it. Also depends on version, so combine into one structure. -- TODO maybe add caching @@ -1231,6 +1246,23 @@ mt.mmsghdrs = { addtype_var(types, "mmsghdrs", "struct {int count; struct mmsghdr msg[?];}", mt.mmsghdrs) +addtype(types, "bpf_attr", "union bpf_attr") + +-- Metatype for Linux perf events +mt.perf_event_attr = { + index = { + type = function(self) return self.pe_type end, + config = function(self) return self.pe_config end, + sample_type = function(self) return self.pe_sample_type end, + }, + newindex = { + type = function(self, v) self.pe_type = c.PERF_TYPE[v] end, + config = function(self, v) self.pe_config = c.PERF_COUNT[v] end, + sample_type = function(self, v) self.pe_sample_type = c.PERF_SAMPLE[v] end, + }, +} +addtype(types, "perf_event_attr", "struct perf_event_attr", mt.perf_event_attr) + -- this is declared above samap_pt = { [c.AF.UNIX] = pt.sockaddr_un, diff --git a/lib/ljsyscall/syscall/linux/util.lua b/lib/ljsyscall/syscall/linux/util.lua index 76a1a86748..a46aeafc8e 100644 --- a/lib/ljsyscall/syscall/linux/util.lua +++ b/lib/ljsyscall/syscall/linux/util.lua @@ -46,8 +46,11 @@ function util.if_nametoindex(name) -- standard function in some libc versions local s, err = S.socket(c.AF.LOCAL, c.SOCK.STREAM, 0) if not s then return nil, err end local i, err = if_nametoindex(name, s) - if not i then return nil, err end - local ok, err = s:close() + if not i then + S.close(s) + return nil, err + end + local ok, err = S.close(s) if not ok then return nil, err end return i end diff --git a/lib/ljsyscall/syscall/linux/x64/nr.lua b/lib/ljsyscall/syscall/linux/x64/nr.lua index 7309565435..0a91a2d2c1 100644 --- a/lib/ljsyscall/syscall/linux/x64/nr.lua +++ b/lib/ljsyscall/syscall/linux/x64/nr.lua @@ -323,6 +323,7 @@ local nr = { getrandom = 318, memfd_create = 319, kexec_file_load = 320, + bpf = 321, } } diff --git a/lib/ljsyscall/syscall/linux/x86/nr.lua b/lib/ljsyscall/syscall/linux/x86/nr.lua index 9757aa2e6a..deb7551239 100644 --- a/lib/ljsyscall/syscall/linux/x86/nr.lua +++ b/lib/ljsyscall/syscall/linux/x86/nr.lua @@ -350,6 +350,7 @@ local nr = { seccomp = 354, getrandom = 355, memfd_create = 356, + bpf = 357, } } diff --git a/lib/ljsyscall/syscall/methods.lua b/lib/ljsyscall/syscall/methods.lua index fb2dcd36f5..51b04e51bf 100644 --- a/lib/ljsyscall/syscall/methods.lua +++ b/lib/ljsyscall/syscall/methods.lua @@ -205,6 +205,89 @@ t.timer = metatype("struct {timer_t timerid[1];}", { --__gc = S.timer_delete, }) +if abi.os == "linux" then + -- Linux performance monitoring reader + t.perf_reader = metatype("struct {int fd; char *map; size_t map_pages; }", { + __new = function (ct, fd) + if not fd then return ffi.new(ct) end + if istype(t.fd, fd) then fd = fd:nogc():getfd() end + return ffi.new(ct, fd) + end, + __len = function(t) return ffi.sizeof(t) end, + __gc = function (t) t:close() end, + __index = { + close = function(t) + t:munmap() + if t.fd > 0 then S.close(t.fd) end + end, + munmap = function (t) + if t.map_pages > 0 then + S.munmap(t.map, (t.map_pages + 1) * S.getpagesize()) + t.map_pages = 0 + end + end, + -- read(2) interface, see `perf_attr.read_format` + -- @return u64 or an array of u64 + read = function (t, len) + local rvals = ffi.new('uint64_t [4]') + local nb, err = S.read(t.fd, rvals, len or ffi.sizeof(rvals)) + if not nb then return nil, err end + return nb == 8 and rvals[0] or rvals + end, + -- mmap(2) interface, see sampling interface (`perf_attr.sample_type` and `perf_attr.mmap`) + -- first page is metadata page, the others are sample_type dependent + mmap = function (t, pages) + t:munmap() + pages = pages or 8 + local map, err = S.mmap(nil, (pages + 1) * S.getpagesize(), "read, write", "shared", t.fd, 0) + if not map then return nil, err end + t.map = map + t.map_pages = pages + return pages + end, + meta = function (t) + return t.map_pages > 0 and ffi.cast("struct perf_event_mmap_page *", t.map) or nil + end, + -- next() function for __ipairs returning (len, event) pairs + -- it only retires read events when current event length is passed + next = function (t, curlen) + local buffer_size = S.getpagesize() * t.map_pages + local base = t.map + S.getpagesize() + local meta = t:meta() + -- Retire last read event or start iterating + if curlen then + meta.data_tail = meta.data_tail + curlen + end + -- End of ring buffer, yield + -- TODO: + if meta.data_head == meta.data_tail then + return + end + local e = pt.perf_event_header(base + (meta.data_tail % buffer_size)) + local e_end = base + (meta.data_tail + e.size) % buffer_size; + -- If the perf event wraps around the ring, we need to make a contiguous copy + if ffi.cast("uintptr_t", e_end) < ffi.cast("uintptr_t", e) then + local tmp_e = ffi.new("char [?]", e.size) + local len = (base + buffer_size) - ffi.cast('char *', e) + ffi.copy(tmp_e, e, len) + ffi.copy(tmp_e + len, base, e.size - len) + e = ffi.cast(ffi.typeof(e), tmp_e) + end + return e.size, e + end, + -- Various ioctl() wrappers + ioctl = function(t, cmd, val) return S.ioctl(t.fd, cmd, val or 0) end, + start = function(t) return t:ioctl("PERF_EVENT_IOC_ENABLE") end, + stop = function(t) return t:ioctl("PERF_EVENT_IOC_DISABLE") end, + refresh = function(t) return t:ioctl("PERF_EVENT_IOC_REFRESH") end, + reset = function(t) return t:ioctl("PERF_EVENT_IOC_RESET") end, + setfilter = function(t, val) return t:ioctl("PERF_EVENT_IOC_SET_FILTER", val) end, + setbpf = function(t, fd) return t:ioctl("PERF_EVENT_IOC_SET_BPF", pt.void(fd)) end, + }, + __ipairs = function(t) return t.next, t, nil end + }) +end + -- TODO reinstate this, more like fd is, hence changes to destroy --[[ t.aio_context = metatype("struct {aio_context_t ctx;}", { diff --git a/lib/ljsyscall/syscall/netbsd/ffifunctions.lua b/lib/ljsyscall/syscall/netbsd/ffifunctions.lua index 2a6741b425..25b32278ca 100644 --- a/lib/ljsyscall/syscall/netbsd/ffifunctions.lua +++ b/lib/ljsyscall/syscall/netbsd/ffifunctions.lua @@ -78,5 +78,7 @@ int __nanosleep50(const struct timespec *req, struct timespec *rem); int __timer_settime50(timer_t timerid, int flags, const struct itimerspec *new_value, struct itimerspec * old_value); int __timer_gettime50(timer_t timerid, struct itimerspec *curr_value); int __adjtime50(const struct timeval *delta, struct timeval *olddelta); + +int fstatat(int dirfd, const char *pathname, struct stat *buf, int flags); ]] diff --git a/lib/ljsyscall/syscall/openbsd/ffi.lua b/lib/ljsyscall/syscall/openbsd/ffi.lua index d89b277f27..a53fc2b5b2 100644 --- a/lib/ljsyscall/syscall/openbsd/ffi.lua +++ b/lib/ljsyscall/syscall/openbsd/ffi.lua @@ -295,6 +295,7 @@ struct sigaction { append [[ int reboot(int howto); int ioctl(int d, unsigned long request, void *arg); +int fstatat(int dirfd, const char *pathname, struct stat *buf, int flags); /* not syscalls, but using for now */ int grantpt(int fildes); diff --git a/lib/ljsyscall/syscall/osx/c.lua b/lib/ljsyscall/syscall/osx/c.lua index 08d6a0339e..82d077f53b 100644 --- a/lib/ljsyscall/syscall/osx/c.lua +++ b/lib/ljsyscall/syscall/osx/c.lua @@ -42,6 +42,7 @@ local C = setmetatable({}, { }) -- new stat structure, else get legacy one; could use syscalls instead +-- does not work for fstatat C.stat = C.stat64 C.fstat = C.fstat64 C.lstat = C.lstat64 @@ -56,7 +57,7 @@ function C.getdirentries(fd, buf, len, basep) end ]] --- cannot find these anywhere! +-- cannot find these anywhere! Apparently not there since 64 bit inodes? --C.getdirentries = ffi.C._getdirentries --C.sigaction = ffi.C._sigaction diff --git a/lib/ljsyscall/syscall/osx/constants.lua b/lib/ljsyscall/syscall/osx/constants.lua index 9a6ec0714d..40108a2003 100644 --- a/lib/ljsyscall/syscall/osx/constants.lua +++ b/lib/ljsyscall/syscall/osx/constants.lua @@ -1109,5 +1109,27 @@ c.CLOCKTYPE = { c.CLOCKTYPE.REALTIME = c.CLOCKTYPE.SYSTEM -return c +c.CLOCK = strflag { + REALTIME = 0, + MONOTONIC_RAW = 4, + MONOTONIC_RAW_APPROX = 5, + MONOTONIC = 6, + UPTIME_RAW = 8, + UPTIME_RAW_APPROX = 9, + PROCESS_CPUTIME_ID = 12, + THREAD_CPUTIME_ID = 16, +} + +-- AT constants only in recent versions, should check when added +c.AT_FDCWD = atflag { + FDCWD = -2, +} +c.AT = multiflags { + EACCESS = 0x0010, + SYMLINK_NOFOLLOW = 0x0020, + SYMLINK_FOLLOW = 0x0040, + REMOVEDIR = 0x0080, +} + +return c diff --git a/lib/ljsyscall/syscall/osx/ffi.lua b/lib/ljsyscall/syscall/osx/ffi.lua index ae6fedc1b0..831e5621bf 100644 --- a/lib/ljsyscall/syscall/osx/ffi.lua +++ b/lib/ljsyscall/syscall/osx/ffi.lua @@ -30,7 +30,8 @@ typedef int64_t blkcnt_t; typedef int32_t blksize_t; typedef int32_t suseconds_t; typedef uint16_t nlink_t; -typedef uint64_t ino_t; // at least on recent desktop; TODO define as ino64_t +typedef uint64_t ino64_t; +typedef uint32_t ino_t; typedef long time_t; typedef int32_t daddr_t; typedef unsigned long clock_t; @@ -158,7 +159,7 @@ struct stat { dev_t st_dev; mode_t st_mode; nlink_t st_nlink; - ino_t st_ino; + ino64_t st_ino; uid_t st_uid; gid_t st_gid; dev_t st_rdev; @@ -174,6 +175,25 @@ struct stat { int32_t st_lspare; int64_t st_qspare[2]; }; +struct stat32 { + dev_t st_dev; + ino_t st_ino; + mode_t st_mode; + nlink_t st_nlink; + uid_t st_uid; + gid_t st_gid; + dev_t st_rdev; + struct timespec st_atimespec; + struct timespec st_mtimespec; + struct timespec st_ctimespec; + off_t st_size; + blkcnt_t st_blocks; + blksize_t st_blksize; + uint32_t st_flags; + uint32_t st_gen; + int32_t st_lspare; + int64_t st_qspare[2]; +}; union sigval { int sival_int; void *sival_ptr; @@ -292,6 +312,7 @@ int mount(const char *type, const char *dir, int flags, void *data); int stat64(const char *path, struct stat *sb); int lstat64(const char *path, struct stat *sb); int fstat64(int fd, struct stat *sb); +int fstatat(int dirfd, const char *pathname, struct stat32 *buf, int flags); int _getdirentries(int fd, char *buf, int nbytes, long *basep); int _sigaction(int signum, const struct sigaction *act, struct sigaction *oldact); diff --git a/lib/ljsyscall/syscall/osx/syscalls.lua b/lib/ljsyscall/syscall/osx/syscalls.lua index 47d7918db7..67949c1905 100644 --- a/lib/ljsyscall/syscall/osx/syscalls.lua +++ b/lib/ljsyscall/syscall/osx/syscalls.lua @@ -53,6 +53,14 @@ function S.clock_get_time(clock_serv, cur_time) return cur_time end +-- cannot find out how to get new stat type from fstatat +function S.fstatat(fd, path, buf, flags) + if not buf then buf = t.stat32() end + local ret, err = C.fstatat(c.AT_FDCWD[fd], path, buf, c.AT[flags]) + if ret == -1 then return nil, t.error(err or errno()) end + return buf +end + return S end diff --git a/lib/ljsyscall/syscall/osx/types.lua b/lib/ljsyscall/syscall/osx/types.lua index 4b3304cde0..204ad5aecd 100644 --- a/lib/ljsyscall/syscall/osx/types.lua +++ b/lib/ljsyscall/syscall/osx/types.lua @@ -120,6 +120,9 @@ end addtype(types, "stat", "struct stat", mt.stat) +-- for fstatat where we can'tseem to get 64 bit version at present +addtype(types, "stat32", "struct stat32", mt.stat) + local signames = {} local duplicates = {LWT = true, IOT = true, CLD = true, POLL = true} for k, v in pairs(c.SIG) do diff --git a/lib/ljsyscall/syscall/syscalls.lua b/lib/ljsyscall/syscall/syscalls.lua index 33051be18a..c6d3417028 100644 --- a/lib/ljsyscall/syscall/syscalls.lua +++ b/lib/ljsyscall/syscall/syscalls.lua @@ -327,7 +327,9 @@ function S.getsockopt(fd, level, optname, optval, optlen) local ret, err = C.getsockopt(getfd(fd), c.SOL[level], c.SO[optname], optval, len) if ret == -1 then return nil, t.error(err or errno()) end if len[0] ~= optlen then error("incorrect optlen for getsockopt: set " .. optlen .. " got " .. len[0]) end - return optval[0] -- TODO will not work if struct, eg see netfilter + local ok, ret = pcall(function () return optval[0] end) + if ok then return ret + else return optval end end function S.bind(sockfd, addr, addrlen) local saddr = pt.sockaddr(addr) @@ -428,6 +430,7 @@ function S.getpid() return C.getpid() end function S.getppid() return C.getppid() end function S.getgid() return C.getgid() end function S.getegid() return C.getegid() end +function S.gettid() return C.gettid() end function S.setuid(uid) return retbool(C.setuid(uid)) end function S.setgid(gid) return retbool(C.setgid(gid)) end function S.seteuid(uid) return retbool(C.seteuid(uid)) end diff --git a/lib/ljsyscall/test/bsd.lua b/lib/ljsyscall/test/bsd.lua index 70cba01b39..5fcb4a3b74 100644 --- a/lib/ljsyscall/test/bsd.lua +++ b/lib/ljsyscall/test/bsd.lua @@ -2,7 +2,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi @@ -89,7 +89,9 @@ test.filesystem_bsd = { test_chflags = function() local fd = assert(S.creat(tmpfile, "RWXU")) assert(fd:write("append")) - assert(S.chflags(tmpfile, "uf_append")) + local ok, err = S.chflags(tmpfile, "uf_append") + if not ok and err.OPNOTSUPP then error "skipped" end + assert(ok, err) assert(fd:write("append")) assert(fd:seek(0, "set")) local n, err = fd:write("not append") @@ -102,7 +104,9 @@ test.filesystem_bsd = { if not S.lchflags then error "skipped" end local fd = assert(S.creat(tmpfile, "RWXU")) assert(fd:write("append")) - assert(S.lchflags(tmpfile, "uf_append")) + local ok, err = S.lchflags(tmpfile, "uf_append") + if not ok and err.OPNOTSUPP then error "skipped" end + assert(ok, err) assert(fd:write("append")) assert(fd:seek(0, "set")) local n, err = fd:write("not append") @@ -114,7 +118,9 @@ test.filesystem_bsd = { test_fchflags = function() local fd = assert(S.creat(tmpfile, "RWXU")) assert(fd:write("append")) - assert(fd:chflags("uf_append")) + local ok, err = fd:chflags("uf_append") + if not ok and err.OPNOTSUPP then error "skipped" end + assert(ok, err) assert(fd:write("append")) assert(fd:seek(0, "set")) local n, err = fd:write("not append") @@ -127,7 +133,9 @@ test.filesystem_bsd = { if not S.chflagsat then error "skipped" end local fd = assert(S.creat(tmpfile, "RWXU")) assert(fd:write("append")) - assert(S.chflagsat("fdcwd", tmpfile, "uf_append", "symlink_nofollow")) + local ok, err = S.chflagsat("fdcwd", tmpfile, "uf_append", "symlink_nofollow") + if not ok and err.OPNOTSUPP then error "skipped" end + assert(ok, err) assert(fd:write("append")) assert(fd:seek(0, "set")) local n, err = fd:write("not append") @@ -258,7 +266,8 @@ test.bsd_extattr = { assert(S.unlink(tmpfile)) local n, err = fd:extattr_get("user", "myattr", false) -- false does raw call with no buffer to return length if not n and err.OPNOTSUPP then error "skipped" end -- fs does not support extattr - assert(not n and err.NOATTR) + assert(not n, "expected to fail") + assert(err.NOATTR, err) assert(fd:close()) end, test_extattr_getsetdel_fd = function() @@ -267,7 +276,8 @@ test.bsd_extattr = { assert(S.unlink(tmpfile)) local n, err = fd:extattr_get("user", "myattr", false) -- false does raw call with no buffer to return length if not n and err.OPNOTSUPP then error "skipped" end -- fs does not support extattr - assert(not n and err.NOATTR) + assert(not n, "expected to fail") + assert(err.NOATTR, err) local n, err = fd:extattr_set("user", "myattr", "myvalue") if not n and err.OPNOTSUPP then error "skipped" end -- fs does not support setting extattr assert(n, err) diff --git a/lib/ljsyscall/test/ctest-linux.lua b/lib/ljsyscall/test/ctest-linux.lua index 4433013fcd..a531542231 100644 --- a/lib/ljsyscall/test/ctest-linux.lua +++ b/lib/ljsyscall/test/ctest-linux.lua @@ -66,6 +66,81 @@ ctypes["struct termios"] = nil -- not defined by glibc ctypes["struct k_sigaction"] = nil +-- eBPF not available on Travis / opaque types +ctypes["struct bpf_insn"] = nil +ctypes["union bpf_attr"] = nil +c.BPF_MAP = {} +c.BPF_CMD = {} +c.BPF_PROG = {} +c.BPF_ATTACH_TYPE = {} +c.BPF.ALU64 = nil +c.BPF.DW = nil +c.BPF.JSGT = nil +c.BPF.JSGE = nil +c.BPF.CALL = nil +c.BPF.EXIT = nil +c.BPF.TO_LE = nil +c.BPF.TO_BE = nil +c.BPF.ANY = nil +c.BPF.NOEXIST = nil +c.BPF.EXIST = nil +c.BPF.END = nil +c.BPF.ARSH = nil +c.BPF.XADD = nil +c.BPF.JNE = nil +c.BPF.MOV = nil +c.SYS.bpf = nil + +-- no perf_event_open on Travis CI +ctypes["struct perf_event_attr"] = nil +ctypes["struct perf_event_reader"] = nil +ctypes["struct perf_event_header"] = nil +ctypes["struct perf_event_mmap_page"] = nil +c.PERF_TYPE = {} +c.PERF_COUNT = {} +c.PERF_SAMPLE = {} +c.PERF_FLAG = {} +c.PERF_SAMPLE_REGS = {} +c.PERF_SAMPLE_BRANCH = {} +c.PERF_READ_FORMAT = {} +c.PERF_RECORD = {} +-- no perf_event_open ioctls on Travis CI +c.IOCTL.PERF_EVENT_IOC_ENABLE = nil +c.IOCTL.PERF_EVENT_IOC_DISABLE = nil +c.IOCTL.PERF_EVENT_IOC_REFRESH = nil +c.IOCTL.PERF_EVENT_IOC_RESET = nil +c.IOCTL.PERF_EVENT_IOC_PERIOD = nil +c.IOCTL.PERF_EVENT_IOC_SET_OUTPUT = nil +c.IOCTL.PERF_EVENT_IOC_SET_FILTER = nil +c.IOCTL.PERF_EVENT_IOC_ID = nil +c.IOCTL.PERF_EVENT_IOC_SET_BPF = nil + +-- not in kernel headers used by Travis CI +ctypes["struct scm_timestamping"] = nil +c.SCM.TSTAMP_ACK = nil +c.SCM.TSTAMP_SCHED = nil +c.SCM.TSTAMP_SND = nil +c.SCM.TIMESTAMPING_OPT_STATS = nil + +-- not in kernel headers used by Travis CI +c.SOF.TIMESTAMPING_LAST = nil +c.SOF.TIMESTAMPING_MASK = nil +c.SOF.TIMESTAMPING_OPT_CMSG = nil +c.SOF.TIMESTAMPING_OPT_ID = nil +c.SOF.TIMESTAMPING_OPT_PKTINFO = nil +c.SOF.TIMESTAMPING_OPT_STATS = nil +c.SOF.TIMESTAMPING_OPT_TSONLY = nil +c.SOF.TIMESTAMPING_OPT_TX_SWHW = nil +c.SOF.TIMESTAMPING_RAW_HARDWARE = nil +c.SOF.TIMESTAMPING_RX_HARDWARE = nil +c.SOF.TIMESTAMPING_RX_SOFTWARE = nil +c.SOF.TIMESTAMPING_SOFTWARE = nil +c.SOF.TIMESTAMPING_SYS_HARDWARE = nil +c.SOF.TIMESTAMPING_TX_ACK = nil +c.SOF.TIMESTAMPING_TX_HARDWARE = nil +c.SOF.TIMESTAMPING_TX_SCHED = nil +c.SOF.TIMESTAMPING_TX_SOFTWARE = nil + if abi.arch == "arm" then ctypes["struct statfs64"] = nil end -- padding difference, not that important for k, v in pairs(c.IOCTL) do if type(v) == "table" then c.IOCTL[k] = v.number end end @@ -223,6 +298,10 @@ c.TCP.QUEUE_SEQ = nil c.TCP.TIMESTAMP = nil c.TCP.USER_TIMEOUT = nil c.TCP.REPAIR_QUEUE = nil +c.RTA.NEWDST = nil +c.RTA.PREF = nil +c.RTA.VIA = nil +c.RTA.MFC_STATS = nil -- these are not in Musl at present TODO send patches to get them in c.IPPROTO.UDPLITE = nil @@ -279,6 +358,23 @@ c.SO.PEEK_OFF = nil c.SO.GET_FILTER = nil c.SO.NOFCS = nil c.SO.WIFI_STATUS = nil +c.SO.REUSEPORT = nil +c.SO.LOCK_FILTER = nil +c.SO.SELECT_ERR_QUEUE = nil +c.SO.BUSY_POLL = nil +c.SO.MAX_PACING_RATE = nil +c.SO.BPF_EXTENSIONS = nil +c.SO.INCOMING_CPU = nil +c.SO.ATTACH_BPF = nil +c.SO.DETACH_BPF = nil +c.SO.ATTACH_REUSEPORT_CBPF = nil +c.SO.ATTACH_REUSEPORT_EBPF = nil + +-- new fcntl +c.F.CANCELLK = nil +c.F.ADD_SEALS = nil +c.F.GET_SEALS = nil +c.F_SEAL = nil -- Musl changes some of the syscall constants in its 32/64 bit handling c.SYS.getdents = nil @@ -318,6 +414,8 @@ c.CBAUDEX = nil -- missing on my mips box c.AUDIT_ARCH.H8300 = nil +-- missing on CI +c.AUDIT_ARCH.AARCH64 = nil -- defined only in linux/termios.h which we cannot include on mips c.TIOCM.OUT1 = nil diff --git a/lib/ljsyscall/test/freebsd.lua b/lib/ljsyscall/test/freebsd.lua index 4a57af07ad..e296017f4b 100644 --- a/lib/ljsyscall/test/freebsd.lua +++ b/lib/ljsyscall/test/freebsd.lua @@ -2,7 +2,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi diff --git a/lib/ljsyscall/test/helpers.lua b/lib/ljsyscall/test/helpers.lua new file mode 100644 index 0000000000..6f29c0a1d3 --- /dev/null +++ b/lib/ljsyscall/test/helpers.lua @@ -0,0 +1,81 @@ +-- misc helper functions + +local require, error, assert, tonumber, tostring, +setmetatable, pairs, ipairs, unpack, rawget, rawset, +pcall, type, table, string, math = +require, error, assert, tonumber, tostring, +setmetatable, pairs, ipairs, unpack, rawget, rawset, +pcall, type, table, string, math + +local debug, collectgarbage = require "debug", collectgarbage + +local ffi = require "ffi" +local bit = require "bit" + +local h = {} + +-- generic assert helper, mainly for tests +function h.assert(cond, err, ...) + if not cond then + error(tostring(err or "unspecified error")) -- annoyingly, assert does not call tostring! + end + collectgarbage("collect") -- force gc, to test for bugs + if type(cond) == "function" then return cond, err, ... end + if cond == true then return ... end + return cond, ... +end + +-- endian conversion +if ffi.abi("be") then -- nothing to do + function h.htonl(b) return b end + function h.htons(b) return b end + function h.convle32(b) return bit.bswap(b) end -- used by file system capabilities, always stored as le +else + function h.htonl(b) return bit.bswap(b) end + function h.htons(b) return bit.rshift(bit.bswap(b), 16) end + function h.convle32(b) return b end -- used by file system capabilities, always stored as le +end +h.ntohl = h.htonl -- reverse is the same +h.ntohs = h.htons -- reverse is the same + +function h.octal(s) return tonumber(s, 8) end +local octal = h.octal + +function h.split(delimiter, text) + if delimiter == "" then return {text} end + if #text == 0 then return {} end + local list = {} + local pos = 1 + while true do + local first, last = text:find(delimiter, pos) + if first then + list[#list + 1] = text:sub(pos, first - 1) + pos = last + 1 + else + list[#list + 1] = text:sub(pos) + break + end + end + return list +end + +function h.trim(s) -- TODO should replace underscore with space + return (s:gsub("^%s*(.-)%s*$", "%1")) +end + +local split, trim = h.split, h.trim + +h.divmod = function(a, b) + return math.floor(a / b), a % b +end + +h.booltoc = setmetatable({ + [0] = 0, + [1] = 1, + [false] = 0, + [true] = 1, +}, {__call = function(tb, arg) return tb[arg or 0] end}) -- allow nil as false + +function h.ctobool(i) return tonumber(i) ~= 0 end + +return h diff --git a/lib/ljsyscall/test/linux-constants.lua b/lib/ljsyscall/test/linux-constants.lua index cb2de6e968..b25947f639 100644 --- a/lib/ljsyscall/test/linux-constants.lua +++ b/lib/ljsyscall/test/linux-constants.lua @@ -155,6 +155,21 @@ local function fixup_constants(abi, c) c.SECCOMP_MODE = nil c.SECCOMP_RET = nil c.MFD = nil + c.RTA.NEWDST = nil + c.RTA.PREF = nil + c.RTA.VIA = nil + c.RTA.MFC_STATS = nil + c.AUDIT_ARCH.AARCH64 = nil + c.SO.MAX_PACING_RATE = nil + c.SO.BPF_EXTENSIONS = nil + c.SO.INCOMING_CPU = nil + c.SO.ATTACH_BPF = nil + c.SO.DETACH_BPF = nil + c.SO.ATTACH_REUSEPORT_CBPF = nil + c.SO.ATTACH_REUSEPORT_EBPF = nil + c.F_SEAL = nil + c.F.ADD_SEALS = nil + c.F.GET_SEALS = nil -- these are not even in linux git head headers or names wrong c.O.ASYNC = nil @@ -202,9 +217,58 @@ local function fixup_constants(abi, c) c.SYS.getrandom = nil c.SYS.memfd_create = nil c.SYS.kexec_file_load = nil + c.SYS.bpf = nil -- new constants c.GRND = nil + -- requires Linux 3.19+, not supported on Travis + c.BPF_MAP = {} + c.BPF_CMD = {} + c.BPF_PROG = {} + c.BPF_ATTACH_TYPE = {} + c.BPF.ALU64 = nil + c.BPF.DW = nil + c.BPF.JSGT = nil + c.BPF.JSGE = nil + c.BPF.CALL = nil + c.BPF.EXIT = nil + c.BPF.TO_LE = nil + c.BPF.TO_BE = nil + c.BPF.END = nil + c.BPF.ARSH = nil + c.BPF.XADD = nil + c.BPF.JNE = nil + c.BPF.MOV = nil + c.BPF.ANY = nil + c.BPF.EXIST = nil + c.BPF.NOEXIST = nil + -- no perf_event_open on Travis CI + c.PERF_TYPE = {} + c.PERF_COUNT = {} + c.PERF_SAMPLE = {} + c.PERF_FLAG = {} + c.PERF_SAMPLE_REGS = {} + c.PERF_SAMPLE_BRANCH = {} + c.PERF_READ_FORMAT = {} + c.PERF_RECORD = {} + + c.SOF.TIMESTAMPING_LAST = nil + c.SOF.TIMESTAMPING_MASK = nil + c.SOF.TIMESTAMPING_OPT_CMSG = nil + c.SOF.TIMESTAMPING_OPT_ID = nil + c.SOF.TIMESTAMPING_OPT_PKTINFO = nil + c.SOF.TIMESTAMPING_OPT_STATS = nil + c.SOF.TIMESTAMPING_OPT_TSONLY = nil + c.SOF.TIMESTAMPING_OPT_TX_SWHW = nil + c.SOF.TIMESTAMPING_RAW_HARDWARE = nil + c.SOF.TIMESTAMPING_RX_HARDWARE = nil + c.SOF.TIMESTAMPING_RX_SOFTWARE = nil + c.SOF.TIMESTAMPING_SOFTWARE = nil + c.SOF.TIMESTAMPING_SYS_HARDWARE = nil + c.SOF.TIMESTAMPING_TX_ACK = nil + c.SOF.TIMESTAMPING_TX_HARDWARE = nil + c.SOF.TIMESTAMPING_TX_SCHED = nil + c.SOF.TIMESTAMPING_TX_SOFTWARE = nil return c end diff --git a/lib/ljsyscall/test/linux-structures.lua b/lib/ljsyscall/test/linux-structures.lua index 811d723dcb..0ab4a68a6c 100644 --- a/lib/ljsyscall/test/linux-structures.lua +++ b/lib/ljsyscall/test/linux-structures.lua @@ -28,6 +28,7 @@ local function fixup_structs(abi, ctypes) ctypes["struct capabilities"] = nil ctypes["struct cap"] = nil ctypes["struct {dev_t dev;}"] = nil + ctypes["struct perf_event_reader"] = nil -- standard headers use __kernel types for these or just fixed sizes ctypes.ino_t = nil @@ -68,7 +69,12 @@ local function fixup_structs(abi, ctypes) ctypes["struct sockaddr_storage"] = nil -- uses __kernel_ ctypes["struct k_sigaction"] = nil -- seems to be incorrect in headers ctypes["struct mmsghdr"] = nil -- too new for our headers - + ctypes["union bpf_attr"] = nil -- too new for our headers + ctypes["struct bpf_insn"] = nil -- too new for our headers + ctypes["struct perf_event_attr"] = nil -- too new for our headers + ctypes["struct perf_event_header"] = nil -- too new for our headers + ctypes["struct perf_event_mmap_page"] = nil -- too new for our headers + ctypes["struct scm_timestamping"] = nil -- too new for our headers ctypes["sigset_t"] = nil -- still some issues return ctypes diff --git a/lib/ljsyscall/test/linux.lua b/lib/ljsyscall/test/linux.lua index 40e4d48506..953e5ea859 100644 --- a/lib/ljsyscall/test/linux.lua +++ b/lib/ljsyscall/test/linux.lua @@ -2,7 +2,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local abi = S.abi local types = S.types local c = S.c @@ -269,7 +269,9 @@ test.misc_linux = { end, ]] test_adjtimex = function() - local tt = assert(S.adjtimex()) + local tt, err = S.adjtimex() + if not tt and err.PERM then error "skipped" end + assert(tt, err) end, test_prctl = function() local n @@ -346,9 +348,12 @@ test.misc_linux = { end, test_memfd = function() if not S.memfd_create then error "skipped" end - local fd, err = S.memfd_create("", "cloexec") + local fd, err = S.memfd_create("", "cloexec, allow_sealing") if not fd and err.NOSYS then error "skipped" end assert(fd, err) + local seals = assert(fd:fcntl("get_seals")) + assert(seals == 0) + assert(fd:fcntl("add_seals", "shrink, grow, write, seal")) assert(fd:close()) end, } @@ -431,10 +436,6 @@ test.netlink = { end, test_getlink = function() local i = assert(nl.getlink()) - local st, err = S.stat("/sys/class/net") -- just in case sysfs not mounted - if not st then error "skipped" end - local df = assert(util.dirtable("/sys/class/net", true)) - assert_equal(#df, #i, "expect same number of interfaces as /sys/class/net") assert(i.lo, "expect a loopback interface") local lo = i.lo assert(lo.flags.up, "loopback interface should be up") @@ -486,7 +487,9 @@ test.netlink = { test_interfaces = function() local i = assert(nl.interfaces()) assert_equal(tostring(i.lo.inet[1].addr), "127.0.0.1", "loopback ipv4 on lo") - assert_equal(tostring(i.lo.inet6[1].addr), "::1", "loopback ipv6 on lo") + if i.lo.inet6[1] then + assert_equal(tostring(i.lo.inet6[1].addr), "::1", "loopback ipv6 on lo") + end end, test_newlink_flags_root = function() local p = assert(S.clone()) @@ -566,12 +569,6 @@ test.netlink = { assert(i.dummy0:down()) assert(i.dummy0:delete()) end, - test_interface_set_macaddr_fail = function() - local i = assert(nl.interfaces()) - assert(i.lo, "expect to find lo") - local ok, err = nl.newlink(i.lo.index, 0, 0, 0, "address", "46:9d:c9:06:dd:dd") - assert(not ok and err and (err.PERM or err.OPNOTSUPP), "should not be able to change macaddr on lo") - end, test_newlink_error_root = function() local ok, err = nl.newlink(-1, 0, "up", "up") assert(not ok, "expect bogus newlink to fail") @@ -670,6 +667,7 @@ test.netlink = { test_getroute_inet6 = function() local r = assert(nl.routes("inet6", "unspec")) local nr = r:match("::1/128") + if #nr == 0 then error "skipped" end -- no ipv6 support assert(#nr >= 1, "expect at least one matched route") -- one of my machines has two local lor = nr[1] assert_equal(tostring(lor.source), "::", "expect empty source route") @@ -766,6 +764,10 @@ test.netlink = { assert_equal(#n, 1) assert_equal(tostring(n[1].lladdr), "46:9d:c9:06:dd:dd") assert_equal(tostring(n[1].dst), "10.0.0.2") + assert_equal(tostring(n[1].dest), "10.0.0.2") + assert_equal(n[1].ifindex, i.dummy0.index) + assert_equal(n[1].state, c.NUD.PERMANENT) + assert_equal(n[1].flags, 0) assert(nl.delneigh(i.dummy0, {family = "inet"}, "dst", "10.0.0.2", "lladdr", "46:9d:c9:06:dd:dd")) assert(i.dummy0:delete()) end, @@ -1315,9 +1317,152 @@ test.bpf = { end, } +-- test eBPF filters +if S.bpf and not S.__rump then + test.bpf_root = {} + test.bpf_root.test_bpf_map_create = function() + local bpf = t.sock_filters(1, { + t.sock_filter("RET,K", 0) + }) + -- Update + local key, klen = ffi.new('int [1]', 0xdead), ffi.sizeof('int') + local fd, err = assert(S.bpf_map_create(c.BPF_MAP.HASH, klen, klen, 10)) + assert(S.bpf_map_op(c.BPF_CMD.MAP_UPDATE_ELEM, fd, key, key) == 0) + -- Retrieve + local val = ffi.new('int [1]', 0xbeef) + local ok, err = S.bpf_map_op(c.BPF_CMD.MAP_LOOKUP_ELEM, fd, key, val) + assert(ok and key[0] == val[0]) + S.close(fd) + end + test.bpf_root.test_bpf_prog_load = function() + local bpf = t.bpf_insns(2, { + t.bpf_insn("ALU64,MOV,K", 0, 0, 0, 1), + t.bpf_insn("JMP,EXIT"), + }) + local fd, err, log = S.bpf_prog_load(c.BPF_PROG.SOCKET_FILTER, bpf, 2) + if not fd then assert(false, err..': '..log) end + S.close(fd) + end +end + +-- test perf_event_open +if S.perf_event_open and not S.__rump then + test.perf_root = {} + test.perf_root.test_perf_open = function () + -- Create perf event attribute with dummy config + local pe = t.perf_event_attr1() + pe[0].type = "software" + pe[0].config = "sw_dummy" + pe[0].disabled = 1 + pe[0].exclude_kernel = 1 + pe[0].exclude_hv = 1 + -- Open event and read a dummy value + local fd = S.perf_event_open(pe) + fd:ioctl("PERF_EVENT_IOC_ENABLE", 0) + local count = t.buffer(ffi.sizeof('int64_t')) + local rb = fd:read(count, ffi.sizeof(count)) + fd:ioctl("PERF_EVENT_IOC_DISABLE", 0) + fd:close() + -- Check just the size of read count + assert(rb == ffi.sizeof(count)) + end + test.perf_root.test_perf_sw = function () + -- Read out a software perf counter + local pe = t.perf_event_attr1() + pe[0].type = "software" + pe[0].config = "sw_cpu_clock" + pe[0].exclude_kernel = 1 + pe[0].exclude_hv = 1 + -- Open event and read a dummy value + -- @note perf event fd has CLO_EXEC, must not fork + local reader = t.perf_reader(S.perf_event_open(pe)) + reader:start() + local ticks = reader:read() + reader:close() + -- Check just the size of read count + assert(ticks > 0) + end + test.perf_root.test_perf_attach = function () + if not S.statfs("/sys/kernel/debug/tracing/events") then + print('skipping') -- debugfs must be mounted + return + end + -- Get tracepoint id + local tp = assert(S.perf_tracepoint("/sys/kernel/debug/tracing/events/syscalls/sys_enter_getcwd")) + local reader = S.perf_attach_tracepoint(tp) + -- Trace getcwd() syscall + reader:start() + S.getcwd() + S.getcwd() + local cnt = reader:read() + reader:stop() + reader:close() + -- Check value + assert(cnt == 2) + end + test.perf_root.test_perf_sampling = function () + if not S.statfs("/sys/kernel/debug/tracing/events") then + print('skipping') -- debugfs must be mounted + return + end + local sample_t = ffi.typeof [[ + struct { + struct perf_event_header header; + uint32_t size; + struct { + uint16_t id; + uint8_t flags; + uint8_t preempt_count; + int pid; + }; + uint64_t ip; + } * + ]] + -- Get tracepoint id + local tp = assert(S.perf_tracepoint("/sys/kernel/debug/tracing/events/syscalls/sys_enter_getcwd")) + local reader = S.perf_attach_tracepoint(tp) + -- Trace getcwd() syscall + reader:mmap() + reader:start() + for i = 1,10 do S.getcwd() end + reader:stop() + -- Read samples from mmap + local cnt = 0; + for len,e in ipairs(reader) do + if e.type ~= c.PERF_RECORD.SAMPLE then break end + -- Check if we're the caller + e = ffi.cast(sample_t, e) + if e.pid == S.getpid() then + cnt = cnt + 1 + end + end + reader:close() + -- Check if we got all samples + assert(cnt == 10) + end + test.perf_root.test_perf_kprobe = function () + if not S.statfs("/sys/kernel/debug/tracing/events") then + print('skipping') -- debugfs must be mounted + return + end + -- Attach a kprobe to open() + local tp = assert(S.perf_probe("kprobe", "myprobe", "do_sys_open $retval", true)) + local reader = S.perf_attach_tracepoint(tp) + reader:start() + S.open("/tmp", "rdonly") + local cnt = reader:read() + reader:stop() + reader:close() + -- Detach probe + S.perf_probe("kprobe", "myprobe", false) + -- See if we hit the probe + assert(cnt == 1) + end +end + -- TODO remove arch tests. Unclear if my ppc/arm does not support or a bug, retest later with newer kernel -- still ppc issues with 3.12.6 ppc, need to debug more, and mips issues -if not (abi.arch == "ppc64le" or abi.arch == "ppc" or abi.arch == "arm" or abi.arch == "mips" or S.__rump) then -- cannot test on rump as uses clone() +if not (abi.arch == "ppc64le" or abi.arch == "ppc" or abi.arch == "mips" or S.__rump) then -- cannot test on rump as uses clone() test.seccomp = { test_no_new_privs = function() -- this must be done for non root to call type 2 seccomp local p = assert(S.clone()) @@ -1481,7 +1626,7 @@ test.seccomp = { local pid = S.getpid() local ofd, err = S.open("/dev/null", "rdonly") -- not allowed fork_assert(not ofd, "should not run open") - fork_assert(err.errno == nr.SYS.open, "syscall that did not work should be open") + fork_assert(err.errno == nr.SYS.open or err.errno == nr.SYS.openat, "syscall that did not work should be open[at]") local pid = S.getpid() S._exit() else @@ -1764,7 +1909,7 @@ test.processes_linux = { fork_assert(S.getppid() == pid0, "parent pid should be previous pid") S.exit(23) else -- parent - local infop, rusage = assert(S.waitid("all", 0, "exited, stopped, continued")) + local infop, rusage = assert(S.waitid("pid", pid, "exited, stopped, continued")) assert_equal(infop.signo, c.SIG.CHLD, "waitid to return SIGCHLD") assert_equal(infop.status, 23, "exit should be 23") assert_equal(infop.code, c.SIGCLD.EXITED, "normal exit expected") @@ -1784,6 +1929,9 @@ test.processes_linux = { assert(status.EXITSTATUS == 23, "exit should be 23") end end, + test_tid = function() + assert(S.getpid() == S.gettid(), "PID should be the same as TID") + end, } test.scheduler = { test_getcpu = function() @@ -1850,13 +1998,11 @@ test.swap = { assert_equal(c.SWAP_FLAG["23, discard"], c.SWAP_FLAG["prefer, discard"] + bit.lshift(23, c.SWAP_FLAG["prio_shift"])) end, test_swap_fail = function() - local ex = "PERM" -- EPERM if not root - if S.geteuid() == 0 then ex = "INVAL" end local ok, err = S.swapon("/dev/null", "23, discard") if not ok and err.NOSYS then return end -- Android does not implement swap, so skip test - assert(not ok and err[ex], "should not create swap on /dev/null") + assert(not ok and (err.PERM or err.INVAL), "should not create swap on /dev/null") local ok, err = S.swapoff("/dev/null") - assert(not ok and err[ex], "no swap on /dev/null") + assert(not ok and (err.PERM or err.INVAL), "no swap on /dev/null") end, -- TODO need mkswap to test success } diff --git a/lib/ljsyscall/include/luaunit/luaunit.lua b/lib/ljsyscall/test/luaunit.lua similarity index 100% rename from lib/ljsyscall/include/luaunit/luaunit.lua rename to lib/ljsyscall/test/luaunit.lua diff --git a/lib/ljsyscall/test/netbsd.lua b/lib/ljsyscall/test/netbsd.lua index 9570d393d4..3039ed49f9 100644 --- a/lib/ljsyscall/test/netbsd.lua +++ b/lib/ljsyscall/test/netbsd.lua @@ -2,7 +2,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi diff --git a/lib/ljsyscall/test/openbsd.lua b/lib/ljsyscall/test/openbsd.lua index ce2b4a7125..755c32094a 100644 --- a/lib/ljsyscall/test/openbsd.lua +++ b/lib/ljsyscall/test/openbsd.lua @@ -2,7 +2,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi diff --git a/lib/ljsyscall/test/osx.lua b/lib/ljsyscall/test/osx.lua index 75ff6b932c..5346ccdbce 100644 --- a/lib/ljsyscall/test/osx.lua +++ b/lib/ljsyscall/test/osx.lua @@ -2,7 +2,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi diff --git a/lib/ljsyscall/test/rump.lua b/lib/ljsyscall/test/rump.lua index d46a47cb3a..afd161209d 100644 --- a/lib/ljsyscall/test/rump.lua +++ b/lib/ljsyscall/test/rump.lua @@ -3,7 +3,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi diff --git a/lib/ljsyscall/include/strict/strict.lua b/lib/ljsyscall/test/strict.lua similarity index 100% rename from lib/ljsyscall/include/strict/strict.lua rename to lib/ljsyscall/test/strict.lua diff --git a/lib/ljsyscall/test/test.lua b/lib/ljsyscall/test/test.lua index 64c875dc70..fddee0a6be 100644 --- a/lib/ljsyscall/test/test.lua +++ b/lib/ljsyscall/test/test.lua @@ -4,12 +4,9 @@ arg = arg or {} --- only use this installation for tests -package.path = "./?.lua;" +local strict = require "test.strict" -local strict = require "include.strict.strict" - -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local assert = helpers.assert @@ -108,7 +105,7 @@ local function assert_equal(...) end USE_EXPECTED_ACTUAL_IN_ASSERT_EQUALS = true -- strict wants this to be set -local luaunit = require "include.luaunit.luaunit" +local luaunit = require "test.luaunit" local sysfile = debug.getinfo(S.open).source local cov = {active = {}, cov = {}} @@ -953,7 +950,7 @@ test_file_operations_at = { local fd = assert(S.open(".")) assert(util.writefile(tmpfile, teststring, "RWXU")) local stat = assert(fd:fstatat(tmpfile)) - assert(stat.size == #teststring, "expect length to br what was written") + assert(stat.size == #teststring, "expect length to be what was written") assert(fd:close()) assert(S.unlink(tmpfile)) end, @@ -1356,7 +1353,9 @@ test_sockets_pipes = { assert(ss:nonblock()) local sa = assert(t.sockaddr_in6(0, "loopback")) assert_equal(sa.family, c.AF.INET6) - assert(ss:bind(sa)) + ok, err = ss:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) local ba = assert(ss:getsockname()) assert_equal(ba.family, c.AF.INET6) assert(ss:listen()) -- will fail if we did not bind @@ -1410,6 +1409,7 @@ test_sockets_pipes = { local ok, err = cs:connect(ba6) local as = ss:accept() local ok, err = cs:connect(ba6) + if err.ADDRNOTAVAIL or err.NETUNREACH then error "skipped" end assert(ok or err.ISCONN, "unexpected error " .. tostring(err)); assert(ss:block()) -- force accept to wait as = as or assert(ss:accept()) @@ -1455,7 +1455,9 @@ test_sockets_pipes = { assert(ss:setsockopt(c.IPPROTO.IPV6, c.IPV6.V6ONLY, 1)) local sa = assert(t.sockaddr_in6(0, "loopback")) assert_equal(sa.family, c.AF.INET6) - assert(ss:bind(sa)) + ok, err = ss:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) local ba = assert(ss:getsockname()) assert_equal(ba.family, c.AF.INET6) assert(ss:listen()) -- will fail if we did not bind @@ -1497,7 +1499,9 @@ test_sockets_pipes = { assert(ss:setsockopt(c.IPPROTO.IPV6, c.IPV6.V6ONLY, 1)) local sa = assert(t.sockaddr_in6(0, "loopback")) assert_equal(sa.family, c.AF.INET6) - assert(ss:bind(sa)) + ok, err = ss:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) local ba = assert(ss:getsockname()) assert_equal(ba.family, c.AF.INET6) assert(ss:listen()) -- will fail if we did not bind @@ -1534,7 +1538,9 @@ test_sockets_pipes = { local loop6 = "::1" local cs = assert(S.socket("inet6", "dgram")) local sa = assert(t.sockaddr_in6(0, loop6)) - assert(ss:bind(sa)) + ok, err = ss:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) local bsa = ss:getsockname() -- find bound address local n = assert(cs:sendto(teststring, nil, c.MSG.NOSIGNAL or 0, bsa)) -- got a sigpipe here on MIPS local f = assert(ss:recv(buf, size)) @@ -1645,7 +1651,9 @@ test_sockets_pipes = { assert(s, err) local s = assert(S.socket("inet6", "stream")) local sa = t.sockaddr_in6(0, "loopback") - assert(s:bind(sa)) + ok, err = s:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) assert_equal(s:getsockopt("socket", "keepalive"), 0) assert(s:setsockopt("socket", "keepalive", 1)) assert(s:getsockopt("socket", "keepalive") ~= 0) @@ -1668,7 +1676,9 @@ test_sockets_pipes = { assert(s, err) local s = assert(S.socket("inet6", "stream")) local sa = t.sockaddr_in6(0, "loopback") - assert(s:bind(sa)) + ok, err = s:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) assert_equal(s:getsockopt(c.IPPROTO.TCP, c.TCP.NODELAY), 0) assert(s:setsockopt(c.IPPROTO.TCP, c.TCP.NODELAY, 1)) assert(s:getsockopt(c.IPPROTO.TCP, c.TCP.NODELAY) ~= 0) @@ -1881,7 +1891,8 @@ test_termios = { local ws, err = S.stdout:ioctl("TIOCGWINSZ") if not ws and err.NOTTY then error "skipped" end -- stdout might not be a tty in test env assert(ws, err) - assert(ws.row > 0 and ws.col > 0) + if ws.row == 0 and ws.col == 0 then error "skipped" end + assert(ws.row > 0 and ws.col > 0, "expect positive winsz") end, } @@ -1946,9 +1957,6 @@ test_raw_socket = { assert(cs == expected, "expect correct ip checksum: got " .. string.format("%%%04X", cs) .. " expected " .. string.format("%%%04X", expected)) end, test_raw_udp_root = function() -- TODO create some helper functions, this is not very nice - - local h = require "syscall.helpers" -- TODO should not have to use later - local loop = "127.0.0.1" local raw = assert(S.socket("inet", "raw", "raw")) -- needed if not on Linux @@ -1972,8 +1980,8 @@ test_raw_socket = { local ca = cl:getsockname() -- TODO iphdr should have __index helpers for endianness etc (note use raw s_addr) - iphdr[0] = {ihl = 5, version = 4, tos = 0, id = 0, frag_off = h.htons(0x4000), ttl = 64, protocol = c.IPPROTO.UDP, check = 0, - saddr = sa.sin_addr.s_addr, daddr = ca.sin_addr.s_addr, tot_len = h.htons(len)} + iphdr[0] = {ihl = 5, version = 4, tos = 0, id = 0, frag_off = helpers.htons(0x4000), ttl = 64, protocol = c.IPPROTO.UDP, check = 0, + saddr = sa.sin_addr.s_addr, daddr = ca.sin_addr.s_addr, tot_len = helpers.htons(len)} --udphdr[0] = {src = sport, dst = ca.port, length = udplen} -- doesnt work with metamethods udphdr[0].src = sport @@ -2215,9 +2223,6 @@ test_proc = { local found = false if #ps == 0 then error "skipped" end -- not mounted but mount point exists for i = 1, #ps do - if ps[i].pid == 1 then - assert(ps[i].cmdline:find("init") or ps[i].cmdline:find("systemd"), "expect init or systemd to be process 1 usually") - end if ps[i].pid == me then found = true end end assert(found, "expect to find my process in ps") @@ -2234,7 +2239,6 @@ test_proc = { local p = util.proc(1) if not p.cmdline then error "skipped" end -- no files found, /proc not mounted assert(p and p.cmdline, "expect init to have cmdline") - assert(p.cmdline:find("init") or p.cmdline:find("systemd"), "expect init or systemd to be process 1 usually") end, } @@ -2305,15 +2309,13 @@ test_mmap = { test_processes = { test_nice = function() local n = assert(S.getpriority("process")) - assert_equal(n, 0, "process should start at priority 0") - local nn = assert(S.nice(1)) - assert_equal(nn, 1) - local nn = assert(S.setpriority("process", 0, 1)) -- sets to 1, which it already is + --assert_equal(n, 0, "process should start at priority 0") + --local nn = assert(S.nice(1)) + --assert_equal(nn, 1) + --local nn = assert(S.setpriority("process", 0, n)) -- sets to 1, which it already is end, test_fork_wait = function() local pid0 = S.getpid() - assert(pid0 > 1, "expecting my pid to be larger than 1") - assert(S.getppid() > 1, "expecting my parent pid to be larger than 1") local pid = assert(S.fork()) if pid == 0 then -- child fork_assert(S.getppid() == pid0, "parent pid should be previous pid") @@ -2327,8 +2329,6 @@ test_processes = { end, test_fork_waitpid = function() local pid0 = S.getpid() - assert(pid0 > 1, "expecting my pid to be larger than 1") - assert(S.getppid() > 1, "expecting my parent pid to be larger than 1") local pid = assert(S.fork()) if pid == 0 then -- child fork_assert(S.getppid() == pid0, "parent pid should be previous pid") @@ -2357,8 +2357,6 @@ test_processes = { end, test_fork_wait4 = function() local pid0 = S.getpid() - assert(pid0 > 1, "expecting my pid to be larger than 1") - assert(S.getppid() > 1, "expecting my parent pid to be larger than 1") local pid = assert(S.fork()) if pid == 0 then -- child fork_assert(S.getppid() == pid0, "parent pid should be previous pid") @@ -2373,8 +2371,6 @@ test_processes = { end, test_fork_wait3 = function() local pid0 = S.getpid() - assert(pid0 > 1, "expecting my pid to be larger than 1") - assert(S.getppid() > 1, "expecting my parent pid to be larger than 1") local pid = assert(S.fork()) if pid == 0 then -- child fork_assert(S.getppid() == pid0, "parent pid should be previous pid") @@ -2492,7 +2488,8 @@ if S.geteuid() == 0 then local i = assert(nl.interfaces()) local lo = assert(i.lo) assert(lo:up()) - assert(S.mount("none", "/sys", "sysfs")) + -- Do not destroy "/sys" if it is mounted + assert(S.statfs("/sys/kernel") or S.mount("none", "/sys", "sysfs")) end else -- not Linux -- run all tests, no namespaces available diff --git a/lib/luajit/src/lj_trace.c b/lib/luajit/src/lj_trace.c index ccb7629841..3f32710c20 100644 --- a/lib/luajit/src/lj_trace.c +++ b/lib/luajit/src/lj_trace.c @@ -6,6 +6,8 @@ #define lj_trace_c #define LUA_CORE +#include + #include "lj_obj.h" @@ -47,6 +49,45 @@ void lj_trace_err_info(jit_State *J, TraceError e) lj_err_throw(J->L, LUA_ERRRUN); } +/* -- Hotcount decay ------------------------------------------------------ */ + +/* We reset all hotcounts every second. This is a rough way to establish a +** relation with elapsed time so that hotcounts provide a measure of frequency. +** +** The concrete goal is to ensure that the JIT will trace code that becomes hot +** over a short duration, but not code that becomes hot over, say, the course +** of an hour. +** +** The "one second" constant is certainly tunable. +** */ + +static void trace_clearsnapcounts(jit_State *J); /* Forward decl. */ + +static inline uint64_t gettime_ns (void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + +/* Timestamp (ns) of last hotcount reset. */ +static uint64_t hotcount_decay_ts; + +/* Decay hotcounts every second. */ +int hotcount_decay (jit_State *J) +{ + uint64_t ts = gettime_ns(); + int decay = (ts - hotcount_decay_ts) > 1000000000LL; /* 1s elapsed? */ + if (decay) { + /* Reset hotcounts. */ + lj_dispatch_init_hotcount(J2G(J)); + trace_clearsnapcounts(J); + hotcount_decay_ts = ts; + } + return decay; +} + + /* -- Trace management ---------------------------------------------------- */ /* The current trace is first assembled in J->cur. The variable length @@ -277,6 +318,8 @@ int lj_trace_flushall(lua_State *L) memset(J->penalty, 0, sizeof(J->penalty)); /* Reset hotcounts. */ lj_dispatch_init_hotcount(J2G(J)); + /* Initialize hotcount decay timestamp. */ + hotcount_decay_ts = gettime_ns(); /* Free the whole machine code and invalidate all exit stub groups. */ lj_mcode_free(J); memset(J->exitstubgroup, 0, sizeof(J->exitstubgroup)); @@ -318,6 +361,21 @@ void lj_trace_freestate(global_State *g) lj_mcode_free(J); } +/* Clear all trace snap counts (side-exit hot counters). */ +static void trace_clearsnapcounts(jit_State *J) +{ + int i, s; + GCtrace *t; + /* Clear hotcounts for all snapshots of all traces. */ + for (i = 1; i < TRACE_MAX; i++) { + t = traceref(J, i); + if (t != NULL) + for (s = 0; s < t->nsnap; s++) + if (t->snap[s].count != SNAPCOUNT_DONE) + t->snap[s].count = 0; + } +} + /* -- Penalties and blacklisting ------------------------------------------ */ /* Blacklist a bytecode instruction. */ @@ -655,6 +713,9 @@ void lj_trace_ins(jit_State *J, const BCIns *pc) void lj_trace_hot(jit_State *J, const BCIns *pc) { /* Note: pc is the interpreter bytecode PC here. It's offset by 1. */ + if (hotcount_decay(J)) + /* Check for hotcount decay, do nothing if hotcounts have decayed. */ + return; ERRNO_SAVE /* Reset hotcount. */ hotcount_set(J2GG(J), pc, J->param[JIT_P_hotloop]*HOTCOUNT_LOOP); @@ -671,6 +732,9 @@ void lj_trace_hot(jit_State *J, const BCIns *pc) /* Check for a hot side exit. If yes, start recording a side trace. */ static void trace_hotside(jit_State *J, const BCIns *pc) { + if (hotcount_decay(J)) + /* Check for hotcount decay, do nothing if hotcounts have decayed. */ + return; SnapShot *snap = &traceref(J, J->parent)->snap[J->exitno]; if (!(J2G(J)->hookmask & HOOK_GC) && isluafunc(curr_func(J->L)) && diff --git a/lib/pflua/src/pf/codegen.dasl b/lib/pflua/src/pf/codegen.dasl index 7359d8ed9b..8a2b2f194d 100644 --- a/lib/pflua/src/pf/codegen.dasl +++ b/lib/pflua/src/pf/codegen.dasl @@ -299,7 +299,7 @@ local function compile(instructions, alloc, dump) local reg1, reg2 = alloc[instr[2]], alloc[instr[3]] | or Rq(reg1), Rq(reg2) - elseif itype == "or" then + elseif itype == "xor" then local reg1, reg2 = alloc[instr[2]], alloc[instr[3]] | xor Rq(reg1), Rq(reg2) diff --git a/lib/pflua/src/pf/regalloc.lua b/lib/pflua/src/pf/regalloc.lua index 8696b7a5df..52edfcdf17 100644 --- a/lib/pflua/src/pf/regalloc.lua +++ b/lib/pflua/src/pf/regalloc.lua @@ -129,11 +129,6 @@ local function live_intervals(instrs) return order end --- All available registers, tied to unix x64 ABI -local caller_regs = {11, 10, 9, 8, 6, 2, 1, 0} -local callee_regs = {15, 14, 13, 12, 3} -local num_regs = #caller_regs + #callee_regs - -- Check if a register is free in the freelist local function is_free(seq, reg) for _, val in ipairs(seq) do @@ -182,23 +177,31 @@ local function delete_useless_movs(ir, alloc) end end +-- All available registers, tied to unix x64 ABI +x86_regs = { + caller_regs = {11, 10, 9, 8, 6, 2, 1, 0}, + callee_regs = {15, 14, 13, 12, 3}, + len = 6 -- %rsi +} + -- Do register allocation with the given IR -- Returns a register allocation and potentially mutates -- the ir for optimizations -function allocate(ir) +function allocate(ir, regs) + regs = regs or x86_regs local intervals = live_intervals(ir) local active = {} local next_spill = 0 -- caller-save registers, use these first - local free_caller = utils.dup(caller_regs) + local free_caller = utils.dup(regs.caller_regs) -- callee-save registers, if we have to - local free_callee = utils.dup(callee_regs) + local free_callee = utils.dup(regs.callee_regs) - local allocation = { len = 6, -- %rsi + local allocation = { len = regs.len, callee_saves = {}, spills = {} } - remove_free(free_caller, 6) + remove_free(free_caller, allocation.len) local function expire_old(interval) local to_expire = {} @@ -213,9 +216,9 @@ function allocate(ir) table.insert(to_expire, idx) -- figure out which free list this register is supposed to be on - if is_free(caller_regs, reg) then + if is_free(regs.caller_regs, reg) then table.insert(free_caller, reg) - elseif is_free(callee_regs, reg) then + elseif is_free(regs.callee_regs, reg) then table.insert(free_callee, reg) else error("unknown register") diff --git a/lib/pflua/src/pf/types.lua b/lib/pflua/src/pf/types.lua index 4ad1758864..5a727b14f8 100644 --- a/lib/pflua/src/pf/types.lua +++ b/lib/pflua/src/pf/types.lua @@ -38,10 +38,6 @@ struct pcap_pkthdr { -- with the high-bit set as negative int32_t values, so we do the same -- for all of our 32-bit values including the "k" field in BPF -- instructions. -ffi.cdef[[ -struct bpf_insn { uint16_t code; uint8_t jt, jf; int32_t k; }; -struct bpf_program { uint32_t bf_len; struct bpf_insn *bf_insns; }; -]] local bpf_program_mt = { __len = function (program) return program.bf_len end, __index = function (program, idx) @@ -50,8 +46,8 @@ local bpf_program_mt = { end } -bpf_insn = ffi.typeof("struct bpf_insn") -bpf_program = ffi.metatype("struct bpf_program", bpf_program_mt) +bpf_insn = ffi.typeof("struct { uint16_t code; uint8_t jt, jf; int32_t k; }") +bpf_program = ffi.metatype("struct { uint32_t bf_len; struct bpf_insn *bf_insns; }", bpf_program_mt) pcap_record = ffi.typeof("struct pcap_record") pcap_pkthdr = ffi.typeof("struct pcap_pkthdr") diff --git a/src/README.md b/src/README.md index 3f1f34ff7d..e02bddb988 100644 --- a/src/README.md +++ b/src/README.md @@ -422,10 +422,21 @@ Allocate packet and fill it with *length* bytes from *pointer*. Allocate packet and fill it with the contents of *string*. -— Function **packet.clone_to_memory* *pointer* *packet* +— Function **packet.account_free** *packet* + +Increment internal engine statistics (*frees*, *freebytes*, *freebits*) as if +*packet* were freed, but do not actually put it back onto the freelist. + +This function is intended to be used by I/O apps in special cases that need +more finegrained control over packet freeing. + +— Function **packet.free_internal** *packet* + +Free *packet* and put it back onto the freelist, but do not increment internal +engine statistics (*frees*, *freebytes*, *freebits*). + +See **packet.account_free**, **packet.free**. -Creates an exact copy of at memory pointed to by *pointer*. *Pointer* must -point to a `packet.packet_t`. ## Memory (core.memory) diff --git a/src/apps/intel_avf/README.md b/src/apps/intel_avf/README.md index 72a49b1193..9eccd78f36 100644 --- a/src/apps/intel_avf/README.md +++ b/src/apps/intel_avf/README.md @@ -18,20 +18,55 @@ The links are named `input` and `output`. *Required*. The PCI address of the NIC as a string. +— Key **vlan** + +*Optional*. VLAN id used for filtering packets. If specified, VLAN tags are +stripped for incoming packets and inserted for outgoing packets. + +— Key **macs** + +*Optional*. Additional unicast or multicast MACs to listen to. +The default is the empty array `{}`. + +— Key **nqueues** + +*Optional*. Number of RSS queues to configure. If specified you need to use +the `intel_avf.IO` app to attach for I/O for each respective queue. + — Key **ring_buffer_size** *Optional*. Number of DMA descriptors to use i.e. size of the DMA transmit and receive queues. Must be a multiple of 128. Default is not specified but assumed to be broadly applicable. +## IO app + +The `intel_avf.IO` app provides a driver for a single RSS queue of a +Virtual Function (see *nqueues*). + +The links are names `input` and `output`. + + DIAGRAM: Intel_avf_IO + +-----------+ + | | + input ---->* IO *----> output + | | + +-----------+ +### Configuration + +— Key **pciaddr** + +*Required*. The PCI address of the NIC as a string. + +— Key **queue** + +*Required*. The queue number of the respective RSS queue, starting from zero. + ## Supported Hardware Ethernet controller [0200]: Intel Corporation Ethernet Virtual Function 700 Series [8086:154c] (rev 02) ## Unsupported features -* Multiple queues per VF. This driver supports a single queue. The spec allows for up to 4 queues. -* RSS with only 1 queue RSS doesn't make sense. -* Multiple vlans are unsupported, `ip link` can be used to map all traffic to a single vlan. -* Multiple MAC addresses are unsupported, `ip link` can be used to set the mac before snabb startup. +* Multiple vlans are unsupported, `vlan` can be used to strip/insert a single vlan ID. * All of the advanced offload features are unsupported. * 16 byte RX descriptors are unsupported. diff --git a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index cef2993d2e..eacadcc407 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -7,11 +7,12 @@ module(..., package.seeall) local ffi = require("ffi") local lib = require("core.lib") +local sync = require("core.sync") local macaddress = require("lib.macaddress") local pci = require("lib.hardware.pci") local register = require("lib.hardware.register") local tophysical = core.memory.virtual_to_physical -local band, lshift, rshift = bit.band, bit.lshift, bit.rshift +local band, lshift, rshift, bor = bit.band, bit.lshift, bit.rshift, bit.bor local transmit, receive, empty = link.transmit, link.receive, link.empty local counter = require("core.counter") local shm = require("core.shm") @@ -24,6 +25,9 @@ local MAC_ADDR_BYTE_LEN = 6 Intel_avf = { config = { pciaddr = { required=true }, + nqueues = {}, + vlan = {}, + macs = {default={}}, ring_buffer_size = {default=2048} } } @@ -107,43 +111,66 @@ local virtchnl_msg_t = ffi.typeof([[ ]]) local virtchnl_msg_ptr_t = ffi.typeof("$ *", virtchnl_msg_t) -local virtchnl_q_pair_t = ffi.typeof([[ +local virtchnl_txq_info_t = ffi.typeof([[ struct { uint16_t vsi_id; - uint16_t num_queue_pairs; - uint32_t pad; + uint16_t queue_id; + uint16_t ring_len; + uint16_t deprecated0; + uint64_t dma_ring_addr; + uint64_t deprecated1; + } __attribute__((packed)) +]]) - uint16_t tx_vsi_id; - uint16_t tx_queue_id; - uint16_t tx_ring_len; - uint16_t tx_deprecated0; - uint64_t tx_dma_ring_addr; - uint64_t tx_deprecated1; - - uint16_t rx_vsi_id; - uint16_t rx_queue_id; - uint32_t rx_ring_len; - uint16_t rx_hdr_size; - uint16_t rx_deprecated0; - uint32_t rx_databuffer_size; - uint32_t rx_max_pkt_size; - uint32_t rx_pad0; - uint64_t rx_dma_ring_addr; - uint32_t rx_deprecated1; - uint32_t rx_pad1; +local virtchnl_rxq_info_t = ffi.typeof([[ + struct { + uint16_t vsi_id; + uint16_t queue_id; + uint32_t ring_len; + uint16_t hdr_size; + uint16_t deprecated0; + uint32_t databuffer_size; + uint32_t max_pkt_size; + uint32_t pad0; + uint64_t dma_ring_addr; + uint32_t deprecated1; + uint32_t pad1; } __attribute__((packed)) ]]) -local virtchnl_q_pair_ptr_t = ffi.typeof("$ *", virtchnl_q_pair_t) -local virtchnl_ether_addr_t = ffi.typeof([[ +local virtchnl_queue_pair_info_t = ffi.typeof([[ + struct { + /* NOTE: vsi_id and queue_id should be indentical for both queues. */ + $ txq; + $ rxq; + } __attribute__((packed)) +]], virtchnl_txq_info_t, virtchnl_rxq_info_t) + +local virtchnl_queue_config_info_t = ffi.typeof([[ + struct { + uint16_t vsi_id; + uint16_t num_queue_pairs; + uint32_t pad; + $ qpair[1]; + } __attribute__((packed)) +]], virtchnl_queue_pair_info_t) + +local virtchnl_queue_config_info_ptr_t = ffi.typeof("$ *", virtchnl_queue_config_info_t) + +local virtchnl_ether_addr_t = ffi.typeof[[ struct { - uint16_t vsi; - uint16_t num_elements; uint8_t addr[6]; // MAC_ADDR_BYTE_LEN uint8_t pad[2]; } __attribute__((packed)) -]]) -local virtchnl_ether_addr_ptr_t = ffi.typeof("$ *", virtchnl_ether_addr_t) +]] +local virtchnl_ether_addr_list_t = ffi.typeof([[ + struct { + uint16_t vsi; + uint16_t num_elements; + $ list[1]; + } __attribute__((packed)) +]], virtchnl_ether_addr_t) +local virtchnl_ether_addr_list_ptr_t = ffi.typeof("$ *", virtchnl_ether_addr_list_t) local eth_stats_t = ffi.typeof([[ struct { @@ -209,6 +236,7 @@ local virtchnl_rss_key_t = ffi.typeof([[ uint16_t vsi_id; uint16_t key_len; uint8_t key[1]; /* RSS hash key, packed bytes */ + uint8_t pad; } __attribute__((packed)) ]]) local virtchnl_rss_key_ptr_t = ffi.typeof('$*', virtchnl_rss_key_t) @@ -218,6 +246,7 @@ local virtchnl_rss_lut_t = ffi.typeof([[ uint16_t vsi_id; uint16_t lut_entries; uint8_t lut[1]; /* RSS lookup table*/ + uint8_t pad; } __attribute__((packed)) ]]) local virtchnl_rss_lut_ptr_t = ffi.typeof('$*', virtchnl_rss_lut_t) @@ -229,6 +258,15 @@ local virtchnl_rss_hena_t = ffi.typeof([[ ]]) local virtchnl_rss_hena_ptr_t = ffi.typeof('$*', virtchnl_rss_hena_t) +local virtchnl_vlan_filter_list_t = ffi.typeof([[ + struct { + uint16_t vsi_id; + uint16_t num_elements; + uint16_t vlan_id[1]; + } __attribute__((packed)) +]]) +local virtchnl_vlan_filter_list_ptr_t = ffi.typeof('$*', virtchnl_vlan_filter_list_t) + local mbox_q_t = ffi.typeof([[ struct { uint8_t flags0; @@ -246,28 +284,163 @@ local mbox_q_t = ffi.typeof([[ ]]) local mbox_q_ptr_t = ffi.typeof('$*', mbox_q_t) -function Intel_avf:init_tx_q() - self.txdesc = ffi.cast(txdesc_ptr_t, - memory.dma_alloc(ffi.sizeof(txdesc_t) * self.ring_buffer_size)) - ffi.fill(self.txdesc, ffi.sizeof(txdesc_t) * self.ring_buffer_size) - self.txqueue = ffi.new("struct packet *[?]", self.ring_buffer_size) - for i=0, self.ring_buffer_size - 1 do - self.txqueue[i] = nil - self.txdesc[i].cmd_type_offset_bsz = 0 +--------------------------------------------------------------- +-- CXQ (Queue pair control object): +-- +-- A "CXQ" is an object that we define to represent a transmit/receive pair. +-- +-- CXQs are created and deleted by a "Control" app (Intel_avf) and, +-- in between, they are used by "IO" apps to send and receive packets. +-- +-- The lifecycle of a CXQ is managed using a state machine. This is +-- necessary because we allow Control and IO apps to start in any +-- order, for Control and IO apps to start/stop/restart independently, +-- for multiple IO apps to attempt to attach to the same CXQ, and even +-- for apps to stop in one Snabb process and be started in another +-- one. +-- +-- (This design is lifted from the apps.mellanox.connectx driver.) +-- +--------------------------------------------------------------- + +-- CXQs can be in one of five states: +-- INIT: CXQ is being initialized by the control app +-- FREE: CXQ is ready and available for use by an IO app. +-- IDLE: CXQ is owned by an app, but not actively processing right now. +-- BUSY: CXQ is owned by an app and is currently processing (e.g. push/pull). +-- DEAD: CXQ has been deallocated; IO app must try to open a new one. +-- +-- Once a CXQ is closed it stays in the DEAD state forever. However, a +-- replacement CXQ with the same name can be created and existing IO +-- apps can reattach to that instead. This will rerun the state machine. +-- +-- Here are the valid state transitions & when they occur: +-- +-- App Change Why +-- ---- ----------- -------------------------------------------------------- +-- CTRL none->INIT: Control app starts initialization. +-- CTRL INIT->FREE: Control app completes initialization. +-- IO FREE->IDLE: IO app starts and becomes owner of the CXQ. +-- IO IDLE->FREE: IO app stops and releases the CXQ for future use. +-- IO IDLE->BUSY: IO app starts running a pull/push method. +-- IO BUSY->IDLE: IO app stops running a pull/push method. +-- CTRL IDLE->DEAD: Control app closes the CXQ. (Replacement can be created.) +-- +-- These state transitions are *PROHIBITED* for important reasons: +-- +-- App Change Why *PROHIBITED* +-- ------ ----------- -------------------------------------------------------- +-- CTRL BUSY->DEAD Cannot close a CXQ while it is busy (must wait.) +-- IO DEAD->BUSY Cannot use a CXQ that is closed (must check.) +-- * DEAD->* Cannot transition from DEAD (must create new CXQ.) +-- +-- Further notes: +-- +-- Packet buffers for pending DMA (transmit or receive) are freed by +-- the Control app (which can disable DMA first) rather than by the IO +-- app (which shuts down with DMA still active.) +-- +-- Abnormal shutdown of the process hosting the Control app is *not* +-- supported. We just don’t have anywhere to free packets to in that +-- case. + +-- A CXQ is represented by one struct allocated in shared memory. +-- +-- The struct defines the fields in very specific terms so that it can +-- be used directly by the driver code (rather than copying back and +-- forth between the shared memory object and a separate native +-- format.) +local cxq_t = ffi.typeof([[ + struct { + int state[1]; // current state / availability + + // configuration information: + uint32_t qno; // queue number + uint16_t vlan; // 802.1Q vlan tag + uint32_t ring_size; // size of rx/tx rings + + // Transmit state + uint32_t tx_next; + uint32_t tx_cand; + uint32_t tx_desc_free; + $ txdesc; + struct packet *txqueue[64*1024]; + + // Receive state + uint32_t rx_tail; + $ rxdesc; + struct packet *rxqueue[64*1024]; + } __attribute((packed)) +]], txdesc_ptr_t, rxdesc_ptr_t) + +-- CXQ states: +local INIT = 0 -- Implicit initial state due to 0 value. +local BUSY = 1 +local IDLE = 2 +local FREE = 3 +local DEAD = 4 + +-- Release CXQ from IO apps after process termination. +-- Called from core.main.shutdown +function shutdown(pid) + for _, pciaddr in ipairs(shm.children("/"..pid.."/intel_avf")) do + for _, queue in ipairs(shm.children("/"..pid.."/intel_avf/"..pciaddr)) do + local backlink = "/"..pid.."/intel_avf/"..pciaddr.."/"..queue + local ok, cxq = pcall(shm.open, backlink, cxq_t) + if ok then + -- Allow reclaimation of CXQ + sync.cas(cxq.state, IDLE, FREE) + sync.cas(cxq.state, BUSY, FREE) + shm.unlink(backlink) + end + end end end -function Intel_avf:init_rx_q() - self.rxqueue = ffi.new("struct packet *[?]", self.ring_buffer_size) - self.rxdesc = ffi.cast(rxdesc_ptr_t, - memory.dma_alloc(ffi.sizeof(rxdesc_t) * self.ring_buffer_size), 128) +function Intel_avf:init_cxq (qno) + -- Create a shared memory object for controlling the queue pair + local cxq = shm.create("group/pci/"..self.pciaddress.."/"..qno, cxq_t) + cxq.qno = qno + cxq.vlan = self.vlan or 0 + cxq.ring_size = self.ring_buffer_size + self:init_tx_q(cxq) + self:init_rx_q(cxq) + return cxq +end + +function Intel_avf:free_cxq (cxq) + -- Free packets remaining in TX/RX queues. + for i = 0, cxq.ring_size-1 do + if cxq.txqueue[i] ~= nil then + packet.free(cxq.txqueue[i]) + end + packet.free(cxq.rxqueue[i]) + end + shm.unlink("group/pci/"..self.pciaddress.."/"..cxq.qno) + shm.unmap(cxq) +end + +function Intel_avf:init_tx_q(cxq) + cxq.txdesc = ffi.cast(txdesc_ptr_t, memory.dma_alloc(ffi.sizeof(txdesc_t) * self.ring_buffer_size)) + ffi.fill(cxq.txdesc, ffi.sizeof(txdesc_t) * self.ring_buffer_size) + for i=0, self.ring_buffer_size - 1 do + cxq.txqueue[i] = nil + cxq.txdesc[i].cmd_type_offset_bsz = 0 + end + cxq.tx_next = 0 + cxq.tx_cand = 0 + cxq.tx_desc_free = self.ring_buffer_size - 1 +end +function Intel_avf:init_rx_q(cxq) + cxq.rxdesc = ffi.cast(rxdesc_ptr_t, memory.dma_alloc(ffi.sizeof(rxdesc_t) * self.ring_buffer_size)) for i = 0, self.ring_buffer_size-1 do local p = packet.allocate() - self.rxqueue[i] = p - self.rxdesc[i].read.address = tophysical(p.data) - self.rxdesc[i].write.status_err_type_len = 0 + cxq.rxqueue[i] = p + cxq.rxdesc[i].read.address = tophysical(p.data) + cxq.rxdesc[i].write.status_err_type_len = 0 end + cxq.rx_tail = 0 end function Intel_avf:supported_hardware() @@ -362,135 +535,204 @@ function Intel_avf:mbox_setup_txq() self.r.VF_ATQLEN(bits({ ENABLE = 31 }) + self.mbox.q_len) end -function Intel_avf:mbox_sr_q() - local tt = self:mbox_send_buf(virtchnl_q_pair_ptr_t) +function Intel_avf:mbox_sr_q(cxqs) + local tt = self:mbox_send_buf(virtchnl_queue_config_info_ptr_t) tt.vsi_id = self.vsi_id - tt.num_queue_pairs = 1 - - tt.tx_vsi_id = self.vsi_id - tt.tx_queue_id = self.qno - tt.tx_ring_len = self.ring_buffer_size - tt.tx_dma_ring_addr = tophysical(self.txdesc) - - tt.rx_vsi_id = self.vsi_id - tt.rx_queue_id = self.qno - tt.rx_ring_len = self.ring_buffer_size - -- Only 32 byte rxdescs are supported, at least by the PF driver in - -- centos 7 3.10.0-957.1.3.el7.x86_64 - tt.rx_hdr_size = 32 - tt.rx_databuffer_size = packet.max_payload - tt.rx_max_pkt_size = packet.max_payload - tt.rx_dma_ring_addr = tophysical(self.rxdesc) - - self:mbox_sr('VIRTCHNL_OP_CONFIG_VSI_QUEUES', ffi.sizeof(virtchnl_q_pair_t) + 64) + tt.num_queue_pairs = #cxqs + + for i, cxq in ipairs(cxqs) do + tt.qpair[i-1].txq.vsi_id = self.vsi_id + tt.qpair[i-1].txq.queue_id = cxq.qno + tt.qpair[i-1].txq.ring_len = cxq.ring_size + tt.qpair[i-1].txq.dma_ring_addr = tophysical(cxq.txdesc) + + tt.qpair[i-1].rxq.vsi_id = self.vsi_id + tt.qpair[i-1].rxq.queue_id = cxq.qno + tt.qpair[i-1].rxq.ring_len = cxq.ring_size + -- Only 32 byte rxdescs are supported, at least by the PF driver in + -- centos 7 3.10.0-957.1.3.el7.x86_64 + tt.qpair[i-1].rxq.hdr_size = 32 + tt.qpair[i-1].rxq.databuffer_size = packet.max_payload + tt.qpair[i-1].rxq.max_pkt_size = packet.max_payload + tt.qpair[i-1].rxq.dma_ring_addr = tophysical(cxq.rxdesc) + end - self.r.rx_tail = self.r.QRX_TAIL[self.qno] - self.r.tx_tail = self.r.QTX_TAIL[self.qno] - self.rx_tail = 0 - self.r.rx_tail(self.ring_buffer_size - 1) + self:mbox_sr('VIRTCHNL_OP_CONFIG_VSI_QUEUES', + ffi.sizeof(virtchnl_queue_config_info_t) + + ffi.sizeof(virtchnl_queue_pair_info_t) * #cxqs) end -function Intel_avf:mbox_sr_enable_q () +function Intel_avf:mbox_sr_enable_q (nqueues) local tt = self:mbox_send_buf(queue_select_ptr_t) tt.vsi_id = self.vsi_id tt.pad = 0 - tt.rx_queues = bits({ ENABLE = self.qno }) - tt.tx_queues = bits({ ENABLE = self.qno }) + local q_enable_mask = lshift(1, nqueues) - 1 + tt.rx_queues = q_enable_mask + tt.tx_queues = q_enable_mask self:mbox_sr('VIRTCHNL_OP_ENABLE_QUEUES', ffi.sizeof(queue_select_t)) end -function Intel_avf:ringnext (index) - return band(index+1, self.ring_buffer_size - 1) +IO = { + config = { + pciaddr = {required=true}, + queue = {required=true} + } +} + +function IO:new (conf) + local self = setmetatable({}, { __index = IO }) + self.pciaddr = pci.qualified(conf.pciaddr) + self.qno = conf.queue + + -- This is also done in Intel_avf:new() but might not have + -- happened yet. + pci.unbind_device_from_linux(self.pciaddr) + + self.fd = pci.open_pci_resource_unlocked(self.pciaddr, 0) + self.base = pci.map_pci_memory(self.fd) + self.r = {} + Intel_avf.load_registers(self) -- Initialize registers at (self.r.*) + + self.online = false -- True when queue is up and running + self.cxq = nil -- shm object containing queue control information + self.open_throttle = -- Timer to throttle shm open attempts (10ms) + lib.throttle(0.25) + + return self end -function Intel_avf:reclaim_txdesc () - local RS = bits({ RS = 5 }) - local COMPLETE = 15 +function IO:stop() + if self.cxq then + assert(sync.cas(self.cxq.state, IDLE, FREE) or + self.cxq.state[0] == DEAD, + "illegal state detected") + self:close() + end +end - while band(self.txdesc[ self:ringnext(self.tx_cand) ].cmd_type_offset_bsz, COMPLETE) == COMPLETE - and self.tx_desc_free < self.ring_buffer_size - 1 do - local c = self.tx_cand - packet.free(self.txqueue[c]) - self.txqueue[c] = nil - self.tx_cand = self:ringnext(self.tx_cand) - self.tx_desc_free = self.tx_desc_free + 1 +-- Close the queue mapping. +function IO:close () + shm.unlink(self.backlink) + shm.unmap(self.cxq) + self.cxq = nil +end + +-- Open the queue mapping. +function IO:open () + local shmpath = "group/pci/"..self.pciaddr.."/"..self.qno + self.backlink = "intel_avf/"..self.pciaddr.."/"..self.qno + if shm.exists(shmpath) then + shm.alias(self.backlink, shmpath) + self.cxq = shm.open(shmpath, cxq_t) + if sync.cas(self.cxq.state, FREE, IDLE) then + -- Select queue tail registers + self.r.rx_tail = self.r.QRX_TAIL[self.cxq.qno] + self.r.tx_tail = self.r.QTX_TAIL[self.cxq.qno] + else + close() -- Queue was not FREE. + end end end -function Intel_avf:push () - local li = self.input.input +-- Return true on successful activation of the queue. +function IO:activate () + -- If not open then make a request on a regular schedule. + if self.cxq == nil and self.open_throttle() then + self:open() + end + if self.cxq then + -- Careful: Control app may have closed the CXQ. + if sync.cas(self.cxq.state, IDLE, BUSY) then + return true + else + assert(self.cxq.state[0] == DEAD, "illegal state detected") + self:close() + end + end +end + +-- Enter the idle state. +function IO:deactivate () + assert(sync.cas(self.cxq.state, BUSY, IDLE)) +end + +local RS = bits({ RS = 5 }) +local COMPLETE = 15 +function IO:reclaim_txdesc () + + local cxq = self.cxq + while band(cxq.txdesc[band(cxq.tx_cand+1, cxq.ring_size-1)].cmd_type_offset_bsz, COMPLETE) == COMPLETE + and cxq.tx_desc_free < cxq.ring_size - 1 do + local c = cxq.tx_cand + packet.free(cxq.txqueue[c]) + cxq.txqueue[c] = nil + cxq.tx_cand = band(cxq.tx_cand+1, cxq.ring_size-1) + cxq.tx_desc_free = cxq.tx_desc_free + 1 + end +end + +local RS_EOP = bits{ EOP = 4, RS = 5, RSV = 6 } +local IL2TAG1 = bits{ IL2TAG1 = 7} +function IO:transmit (li) if li == nil then return end - local RS_EOP = bits({ EOP = 4, RS = 5 }) + local cxq = self.cxq + local RS_EOP_IL2TAG1 = bor(RS_EOP, (cxq.vlan>0 and IL2TAG1) or 0) + local L2TAG1 = lshift(0ULL+cxq.vlan, 48) local SIZE_SHIFT = 34 self:reclaim_txdesc() - while not empty(li) and self.tx_desc_free > 0 do + while not empty(li) and cxq.tx_desc_free > 0 do local p = receive(li) - -- NB: need to extend size for 4 byte CRC (not clear from the spec.) - local size = lshift(4ULL+p.length, SIZE_SHIFT) - self.txdesc[ self.tx_next ].address = tophysical(p.data) - self.txqueue[ self.tx_next ] = p - self.txdesc[ self.tx_next ].cmd_type_offset_bsz = RS_EOP + size - self.tx_next = self:ringnext(self.tx_next) - self.tx_desc_free = self.tx_desc_free - 1 + local size = lshift(0ULL+p.length, SIZE_SHIFT) -- NB: extend to 64 bit before shift + cxq.txdesc[ cxq.tx_next ].address = tophysical(p.data) + cxq.txqueue[ cxq.tx_next ] = p + cxq.txdesc[ cxq.tx_next ].cmd_type_offset_bsz = bor(RS_EOP_IL2TAG1, size, L2TAG1) + cxq.tx_next = band(cxq.tx_next+1, cxq.ring_size-1) + cxq.tx_desc_free = cxq.tx_desc_free - 1 end C.full_memory_barrier() - self.r.tx_tail(band(self.tx_next, self.ring_buffer_size - 1)) - - if self.sync_stats_throttle() then - self:sync_stats() - end + self.r.tx_tail(band(cxq.tx_next, cxq.ring_size - 1)) end -function Intel_avf:pull() - local lo = self.output.output +function IO:receive (lo) if lo == nil then return end local pkts = 0 - while band(self.rxdesc[self.rx_tail].write.status_err_type_len, 0x01) == 1 and pkts < engine.pull_npackets do - local p = self.rxqueue[self.rx_tail] - p.length = rshift(self.rxdesc[self.rx_tail].write.status_err_type_len, 38) + local cxq = self.cxq + while band(cxq.rxdesc[cxq.rx_tail].write.status_err_type_len, 0x01) == 1 and pkts < engine.pull_npackets do + local p = cxq.rxqueue[cxq.rx_tail] + p.length = rshift(cxq.rxdesc[cxq.rx_tail].write.status_err_type_len, 38) transmit(lo, p) local np = packet.allocate() - self.rxqueue[self.rx_tail] = np - self.rxdesc[self.rx_tail].read.address = tophysical(np.data) - self.rxdesc[self.rx_tail].write.status_err_type_len = 0 - self.rx_tail = band(self.rx_tail + 1, self.ring_buffer_size-1) + cxq.rxqueue[cxq.rx_tail] = np + cxq.rxdesc[cxq.rx_tail].read.address = tophysical(np.data) + cxq.rxdesc[cxq.rx_tail].write.status_err_type_len = 0 + cxq.rx_tail = band(cxq.rx_tail+1, cxq.ring_size-1) pkts = pkts + 1 end -- This avoids the queue being full / empty when HEAD=TAIL C.full_memory_barrier() - self.r.rx_tail(band(self.rx_tail - 1, self.ring_buffer_size - 1)) - - if self.sync_stats_throttle() then - self:sync_stats() - end + self.r.rx_tail(band(cxq.rx_tail-1, cxq.ring_size-1)) end -function Intel_avf:sync_stats () - if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_GET_STATS'] then - self:mbox_r_stats('async') - end - if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_RESET_VF'] then - self:mbox_s_stats() +function IO:push () + if self:activate() then + self:transmit(self.input.input) + self:deactivate() end end -function Intel_avf:flush_stats () - if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_GET_STATS'] then - self:mbox_r_stats() +function IO:pull () + if self:activate() then + self:receive(self.output.output) + self:deactivate() end - self:mbox_s_stats() - self:mbox_r_stats() end -function Intel_avf:rxdrop () return counter.read(self.shm.rxdrop) end -function Intel_avf:txdrop () return counter.read(self.shm.txdrop) end - function Intel_avf:mbox_setup() local dlen = 4096 self.mbox = { @@ -520,9 +762,9 @@ function Intel_avf:mbox_setup() VIRTCHNL_OP_CONFIG_IRQ_MAP = 7, VIRTCHNL_OP_ENABLE_QUEUES = 8, VIRTCHNL_OP_DISABLE_QUEUES = 9, - -- VIRTCHNL_OP_ADD_ETH_ADDR = 10, + VIRTCHNL_OP_ADD_ETH_ADDR = 10, -- VIRTCHNL_OP_DEL_ETH_ADDR = 11, - -- VIRTCHNL_OP_ADD_VLAN = 12, + VIRTCHNL_OP_ADD_VLAN = 12, -- VIRTCHNL_OP_DEL_VLAN = 13, -- VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE = 14, VIRTCHNL_OP_GET_STATS = 15, @@ -548,7 +790,7 @@ function Intel_avf:mbox_sr(opcode, datalen) return self:mbox_recv(opcode) end -function Intel_avf:mbox_send(opcode, datalen) +function Intel_avf:mbox_send(opcode, datalen, timeout) assert(opcode == 'VIRTCHNL_OP_RESET_VF' or self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_RESET_VF']) @@ -582,9 +824,11 @@ function Intel_avf:mbox_send(opcode, datalen) self.r.VF_ATQT(self.mbox.next_send_idx) lib.waitfor(function() + assert(not (timeout and timeout()), "timeout") return self.r.VF_ATQT() == self.mbox.next_send_idx end) lib.waitfor(function() + assert(not (timeout and timeout()), "timeout") -- 1 == bits({ DescriptorDone = 0 }) -- 2 == bits({ Complete = 1 }) @@ -617,7 +861,7 @@ function Intel_avf:mbox_sr_caps() -- dpdk/drivers/net/avf/avf_vchnl.c local supported_caps = bits({ VIRTCHNL_VF_OFFLOAD_L2 = 0, - VIRTCHNL_VF_OFFLOAD_VLAN = 16, + VIRTCHNL_VF_OFFLOAD_VLAN = 16, -- NB: Could leave this bit off and let PF handle VLANs VIRTCHNL_VF_OFFLOAD_RX_POLLING = 17, VIRTCHNL_VF_OFFLOAD_RSS_PF = 19 }) @@ -685,12 +929,13 @@ function Intel_avf:mbox_recv(opcode, async) return ptr end -function Intel_avf:wait_for_vfgen_rstat() +function Intel_avf:wait_for_vfgen_rstat(timeout) -- Constant names stolen from DPDK drivers/net/avf/base/virtchnl.h -- Section 6.1 on page 51 local mask0 = bits( { VIRTCHNL_VFR_COMPLETED = 1 }) local mask1 = bits( { VIRTCHNL_VFR_VFACTIVE = 2 }) lib.waitfor(function () + assert(not (timeout and timeout()), "timeout") local v = self.r.VFGEN_RSTAT() return bit.band(mask0, v) == mask0 or bit.band(mask1, v) == mask1 end) @@ -698,31 +943,30 @@ end function Intel_avf:new(conf) local self = { - pciaddress = conf.pciaddr, + pciaddress = pci.qualified(conf.pciaddr), path = pci.path(conf.pciaddr), + vlan = conf.vlan, r = {}, ring_buffer_size = conf.ring_buffer_size, - - tx_next = 0, - tx_cand = 0, - tx_desc_free = conf.ring_buffer_size - 1, - qno = 0, - shm = { - rxbytes = {counter}, - rxpackets = {counter}, - rxmcast = {counter}, - rxbcast = {counter}, - rxdrop = {counter}, - rx_unknown_protocol = {counter}, - txbytes = {counter}, - txpackets = {counter}, - txmcast = {counter}, - txbcast = {counter}, - txdrop = {counter}, - txerrors = {counter} - }, sync_stats_throttle = lib.throttle(1) } + -- PCI device statistics + local frame = { + macaddr = {counter}, + rxbytes = {counter}, + rxpackets = {counter}, + rxmcast = {counter}, + rxbcast = {counter}, + rxdrop = {counter}, + rxerrors = {counter}, + txbytes = {counter}, + txpackets = {counter}, + txmcast = {counter}, + txbcast = {counter}, + txdrop = {counter}, + txerrors = {counter} + } + self.stats = shm.create_frame("pci/"..self.pciaddress, frame) -- pg79 /* number of descriptors, multiple of 32 */ assert(self.ring_buffer_size % 32 == 0, @@ -730,63 +974,153 @@ function Intel_avf:new(conf) self = setmetatable(self, { __index = Intel_avf }) self:supported_hardware() - self.base, self.fd = pci.map_pci_memory_unlocked(self.pciaddress, 0) - self:load_registers() + self.fd = pci.open_pci_resource_locked(self.pciaddress, 0) pci.unbind_device_from_linux(self.pciaddress) pci.set_bus_master(self.pciaddress, true) - pci.disable_bus_master_cleanup(self.pciaddress) + self.base = pci.map_pci_memory(self.fd) + self:load_registers() -- wait for the nic to be ready, setup the mailbox and then reset it -- that way it doesn't matter what state you where given the card - self:wait_for_vfgen_rstat() - self:mbox_setup() - self:stop() - - -- FIXME - -- I haven't worked out why the sleep is required but without it - -- self_mbox_set_version hangs indefinitely - --C.sleep(1) - -- See elaboration in Intel_avf:stop() + lib.waitfor(function () + return pcall(function () + self:wait_for_vfgen_rstat() + self:mbox_setup() + self:reset() -- reset can timeout + end) + end) -- setup the nic for real self:mbox_setup() self:mbox_sr_version() self:mbox_sr_caps() - self:mbox_s_rss() - self:init_tx_q() - self:init_rx_q() + self:mbox_sr_rss(conf.nqueues or 1) + if #conf.macs > 0 then + self:mbox_sr_add_mac(conf.macs) + end + if self.vlan then + self:mbox_sr_vlan() + end + + -- publish device MAC address to SHM + counter.set(self.stats.macaddr, self.mac.bits) + + -- Queue setup + self.cxqs = {} + for qno=0, (conf.nqueues or 1) - 1 do + self.cxqs[#self.cxqs+1] = self:init_cxq(qno) + end self:init_irq() - self:mbox_sr_irq() + self:mbox_sr_irq(conf.nqueues or 1) + + self:mbox_sr_q(self.cxqs) + self:mbox_sr_enable_q(#self.cxqs) + + for _, cxq in ipairs(self.cxqs) do + -- CXQ is now fully initialized & ready for attach. + assert(sync.cas(cxq.state, INIT, FREE)) + end + + if not conf.nqueues then + -- If number of queues it not explicitly configured default to + -- old behavior and configure this app to do I/O on a single queue. + self.io = IO:new{pciaddr=self.pciaddress, queue=0} + self.io.input, self.io.output = {}, {} + end - self:mbox_sr_q() - self:mbox_sr_enable_q() return self end function Intel_avf:link() - -- Alias SHM frame to canonical location. - if not shm.exists("pci/"..self.pciaddress) then - shm.alias("pci/"..self.pciaddress, "apps/"..self.appname) + if self.io then + self.io.input, self.io.output = self.input, self.output end end -function Intel_avf:stop() +function Intel_avf:push () + if self.io then + self.io:push() + end + if self.sync_stats_throttle() then + self:sync_stats() + end +end + +function Intel_avf:pull () + if self.io then + self.io:pull() + end + if self.sync_stats_throttle() then + self:sync_stats() + end +end + +function Intel_avf:sync_stats () + if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_GET_STATS'] then + self:mbox_r_stats('async') + end + if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_RESET_VF'] then + self:mbox_s_stats() + end +end + +function Intel_avf:flush_stats () + if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_GET_STATS'] then + self:mbox_r_stats() + end + self:mbox_s_stats() + self:mbox_r_stats() +end + +function Intel_avf:rxdrop () return counter.read(self.stats.rxdrop) end +function Intel_avf:txdrop () return counter.read(self.stats.txdrop) end + +function Intel_avf:reset() -- From "Appendix A Virtual Channel Protocol": -- VF sends this request to PF with no parameters PF does NOT respond! VF -- driver must delay then poll VFGEN_RSTAT register until reset completion -- is indicated. The admin queue must be reinitialized after this operation. - self:mbox_send('VIRTCHNL_OP_RESET_VF', 0) + self:mbox_send('VIRTCHNL_OP_RESET_VF', 0, lib.timeout(1)) -- As per the above we (the VF driver) must "delay". Sadly, the spec does -- (as of this time / to my knowledge) not give further clues as to how to -- detect that the delay is sufficient. One second turned out to be not -- enough in some cases, two seconds has always worked so far. C.usleep(2e6) - self:wait_for_vfgen_rstat() - -- Unlink SHM alias. + self:wait_for_vfgen_rstat(lib.timeout(1)) +end + +function Intel_avf:stop() + self:reset() + pci.set_bus_master(self.pciaddress, false) + pci.close_pci_resource(self.fd, self.base) + -- If we have an embedded IO app, stop it. + if self.io then + self.io:stop() + end + -- Free packets remaining in TX/RX queues. + for _, cxq in ipairs(self.cxqs) do + local timeout = lib.timeout(3) + lib.waitfor(function () + assert(not timeout(), "Intel_avf: failed to free queue "..tonumber(cxq.qno)) + return sync.cas(cxq.state, FREE, DEAD) or sync.cas(cxq.state, IDLE, DEAD) + end) + self:free_cxq(cxq) + end + -- Unlink stats frame. shm.unlink("pci/"..self.pciaddress) end +function Intel_avf:report () + self:flush_stats() + for _, c in ipairs{ + 'rxbytes', 'rxpackets', 'rxmcast', 'rxbcast', 'rxdrop', 'rxdrop', + 'txbytes', 'txpackets', 'txmcast', 'txbcast', 'txdrop', 'txerrors' + } do + print((" %-20s %20s"):format(c, lib.comma_value(counter.read(self.stats[c])))) + end +end + function Intel_avf:init_irq() local intv = bit.lshift(20, 5) local v = bit.bor(bits({ ENABLE = 0, CLEARPBA = 1, ITR0 = 3, ITR1 = 4}), intv) @@ -795,31 +1129,67 @@ function Intel_avf:init_irq() self.r.VFINT_DYN_CTLN[0](v) end -function Intel_avf:mbox_sr_irq() +function Intel_avf:mbox_sr_irq(nqueues) local tt = self:mbox_send_buf(virtchnl_irq_map_info_ptr_t) tt.num_vectors = 1 tt.vsi_id = self.vsi_id tt.vector_id = 0 - tt.rxq_map = 1 + tt.rxq_map = 2^nqueues-1 -- disable interrupts for all queues self:mbox_sr("VIRTCHNL_OP_CONFIG_IRQ_MAP", ffi.sizeof(virtchnl_irq_map_info_t) + 12) end -function Intel_avf:mbox_sr_add_mac() +function Intel_avf:mbox_sr_add_mac(macs) -- pg81 - local tt = self:mbox_send_buf(virtchnl_ether_addr_ptr_t) + local tt = self:mbox_send_buf(virtchnl_ether_addr_list_ptr_t) tt.vsi = self.vsi_id - tt.num_elements = 1 - ffi.copy(tt.addr, self.mac, MAC_ADDR_BYTE_LEN) - self:mbox_sr('VIRTCHNL_OP_ADD_ETH_ADDR', ffi.sizeof(virtchnl_ether_addr_t) + 8) + tt.num_elements = #macs + for i, mac in ipairs(macs) do + ffi.copy(tt.list[i-1].addr, mac, MAC_ADDR_BYTE_LEN) + end + self:mbox_sr('VIRTCHNL_OP_ADD_ETH_ADDR', + ffi.sizeof(virtchnl_ether_addr_list_t) + + ffi.sizeof(virtchnl_ether_addr_t) * #macs) end -function Intel_avf:mbox_s_rss() - -- pg83 - -- Forcefully disable the NICs RSS features. Contrary to the spec, RSS - -- capabilites are turned on by default and need to be disabled (as least - -- under Linux/some NICs.) +function Intel_avf:mbox_sr_rss(nqueues) + -- Setup HENA local tt = self:mbox_send_buf(virtchnl_rss_hena_ptr_t) + if nqueues == 1 then + -- pg83 + -- Forcefully disable the NICs RSS features. Contrary to the spec, RSS + -- capabilites are turned on by default and need to be disabled (as least + -- under Linux/some NICs.) + tt.hena = 0 + else + -- Enable all + tt.hena = 0xffffffffffffffffULL + end self:mbox_sr('VIRTCHNL_OP_SET_RSS_HENA', ffi.sizeof(virtchnl_rss_hena_t)) + -- Set random RSS key + local tt = self:mbox_send_buf(virtchnl_rss_key_ptr_t) + tt.vsi_id = self.vsi_id + tt.key_len = self.rss_key_size + ffi.copy(tt.key, lib.random_bytes(self.rss_key_size), self.rss_key_size) + self:mbox_sr('VIRTCHNL_OP_CONFIG_RSS_KEY', + ffi.sizeof(virtchnl_rss_key_t) + self.rss_key_size-1) + -- Setup LUT + local tt = self:mbox_send_buf(virtchnl_rss_lut_ptr_t) + tt.vsi_id = self.vsi_id + tt.lut_entries = self.rss_lut_size + for i=0, self.rss_lut_size-1 do + tt.lut[i] = i % nqueues -- fill LUT with configured queues + end + self:mbox_sr('VIRTCHNL_OP_CONFIG_RSS_LUT', + ffi.sizeof(virtchnl_rss_lut_t) + self.rss_lut_size-1) +end + +function Intel_avf:mbox_sr_vlan() + local tt = self:mbox_send_buf(virtchnl_vlan_filter_list_ptr_t) + tt.vsi_id = self.vsi_id + tt.num_elements = 1 + tt.vlan_id[0] = self.vlan + self:mbox_sr('VIRTCHNL_OP_ADD_VLAN', + ffi.sizeof(virtchnl_vlan_filter_list_t) + ffi.sizeof("uint16_t")*1) end function Intel_avf:mbox_s_stats() @@ -835,18 +1205,18 @@ function Intel_avf:mbox_r_stats(async) local stats = ffi.cast(eth_stats_ptr_t, ret) local set = counter.set - set(self.shm.rxbytes, stats.rx_bytes) - set(self.shm.rxpackets, stats.rx_unicast) - set(self.shm.rxmcast, stats.rx_multicast) - set(self.shm.rxbcast, stats.rx_broadcast) - set(self.shm.rxdrop, stats.rx_discards) - set(self.shm.rx_unknown_protocol, stats.rx_unknown_protocol) - - set(self.shm.txbytes, stats.tx_bytes) - set(self.shm.txpackets, stats.tx_unicast) - set(self.shm.txmcast, stats.tx_multicast) - set(self.shm.txbcast, stats.tx_broadcast) - set(self.shm.txdrop, stats.tx_discards) - set(self.shm.txerrors, stats.tx_errors) + set(self.stats.rxbytes, stats.rx_bytes) + set(self.stats.rxpackets, stats.rx_unicast) + set(self.stats.rxmcast, stats.rx_multicast) + set(self.stats.rxbcast, stats.rx_broadcast) + set(self.stats.rxdrop, stats.rx_discards) + set(self.stats.rxdrop, stats.rx_unknown_protocol) + + set(self.stats.txbytes, stats.tx_bytes) + set(self.stats.txpackets, stats.tx_unicast) + set(self.stats.txmcast, stats.tx_multicast) + set(self.stats.txbcast, stats.tx_broadcast) + set(self.stats.txdrop, stats.tx_discards) + set(self.stats.txerrors, stats.tx_errors) end diff --git a/src/apps/intel_avf/tests/back2back/multicast.snabb b/src/apps/intel_avf/tests/back2back/multicast.snabb new file mode 100755 index 0000000000..42054c1831 --- /dev/null +++ b/src/apps/intel_avf/tests/back2back/multicast.snabb @@ -0,0 +1,116 @@ +#!../../../../snabb snsh +local vf0 = os.getenv("SNABB_AVF_PF0_VF0") +local vf1 = os.getenv("SNABB_AVF_PF1_VF0") or os.getenv("SNABB_AVF_PF0_VF1") + +assert(vf0 ~= nil, "SNABB_AVF_PF0_VF0 is nil") +assert(vf1 ~= nil, "SNABB_AVF_PF1_VF0 is nil") + +local src = os.getenv("SNABB_AVF_PF0_SRC0") +local dst = os.getenv("SNABB_AVF_PF1_DST0") or os.getenv("SNABB_AVF_PF0_DST1") + +assert(src ~= nil, "SNABB_AVF_SRC0 is nil") +assert(dst ~= nil, "SNABB_AVF_DST1 is nil") + +local packet_count = 1001 + +local basic = require("apps.basic.basic_apps") +local intel_avf = require("apps.intel_avf.intel_avf") +local match = require("apps.test.match") +local npackets = require("apps.test.npackets") +local synth = require("apps.test.synth") +local counter = require("core.counter") + +-- Broadcast + +local c = config.new() +config.app(c, "synth", synth.Synth, { + sizes = {64,67,128,133,192,256,384,512,777,1024}, + src=src, + dst="ff:ff:ff:ff:ff:ff", + random_payload = true +} ) +config.app(c, "tee", basic.Tee) +config.app(c, "match", match.Match) + +config.app(c, "npackets", npackets.Npackets, { npackets = packet_count }) +config.app(c, "nic0", intel_avf.Intel_avf, { pciaddr = vf0 }) +config.app(c, "nic1", intel_avf.Intel_avf, { pciaddr = vf1 }) + +config.link(c, "synth.output -> npackets.input") +config.link(c, "npackets.output -> tee.input") +config.link(c, "tee.output1 -> nic0.input") +config.link(c, "nic1.output -> match.rx") +config.link(c, "tee.output2 -> match.comparator") + +engine.configure(c) + +local n0 = engine.app_table['nic0'] +local n1 = engine.app_table['nic1'] +n0:flush_stats() +n1:flush_stats() + +engine.main({duration = 1, report = false}) +engine.report_links() +engine.report_apps() + +function rx(l1, l2) + return counter.read(engine.link_table[l1 .. " -> " .. l2].stats.rxpackets) +end +function assert_eq(a,b,msg) + local an = tonumber(a) + local bn = tonumber(b) + assert(an == bn, msg .. " " .. an .. " ~= " .. bn) +end + +local s = rx("tee.output1", "nic0.input") +local r = rx("nic1.output", "match.rx") +assert_eq(s, r, "packets_sr_1") + +n0:flush_stats() +n1:flush_stats() +assert_eq(counter.read(n0.stats.txbcast), counter.read(n1.stats.rxbcast), "mxbox_sr_stats_1") +assert_eq(counter.read(n0.stats.txbcast), packet_count, "mbox_sr_stats_2") + +local m = engine.app_table['match'] +assert(#m:errors() == 0, "Corrupt packets.") + +engine.configure(config.new()) + +-- Multicast + +local ethernet = require("lib.protocol.ethernet") +local ipv6 = require("lib.protocol.ipv6") + +local v6_mcast = ipv6:solicited_node_mcast(ipv6:pton("fd10::1")) +local mac_mcast = ethernet:ipv6_mcast(v6_mcast) + +config.app(c, "synth", synth.Synth, { + sizes = {64,67,128,133,192,256,384,512,777,1024}, + src=src, + dst=ethernet:ntop(mac_mcast), + random_payload = true +} ) +config.app(c, "nic1", intel_avf.Intel_avf, { pciaddr = vf1, macs = {mac_mcast} }) + +engine.configure(c) + +local n0 = engine.app_table['nic0'] +local n1 = engine.app_table['nic1'] +n0:flush_stats() +n1:flush_stats() + +engine.main({duration = 1, report = false}) +engine.report_links() +engine.report_apps() + +local s = rx("tee.output1", "nic0.input") +local r = rx("nic1.output", "match.rx") +assert_eq(s, r, "packets_sr_1") + +n0:flush_stats() +n1:flush_stats() +assert_eq(counter.read(n0.stats.txmcast), counter.read(n1.stats.rxmcast), "mxbox_sr_stats_1") +assert_eq(counter.read(n0.stats.txmcast), packet_count, "mbox_sr_stats_2") + +local m = engine.app_table['match'] +assert(#m:errors() == 0, "Corrupt packets.") \ No newline at end of file diff --git a/src/apps/intel_avf/tests/back2back/rss.snabb b/src/apps/intel_avf/tests/back2back/rss.snabb new file mode 100755 index 0000000000..e2ef6b7c06 --- /dev/null +++ b/src/apps/intel_avf/tests/back2back/rss.snabb @@ -0,0 +1,125 @@ +#!../../../../snabb snsh +local vf0 = os.getenv("SNABB_AVF_PF0_VF0") +local vf1 = os.getenv("SNABB_AVF_PF1_VF0") or os.getenv("SNABB_AVF_PF0_VF1") + +assert(vf0 ~= nil, "SNABB_AVF_PF0_VF0 is nil") +assert(vf1 ~= nil, "SNABB_AVF_PF1_VF0 is nil") + +local src = os.getenv("SNABB_AVF_PF0_SRC0") +local dst = os.getenv("SNABB_AVF_PF1_DST0") or os.getenv("SNABB_AVF_PF0_DST1") + +assert(src ~= nil, "SNABB_AVF_SRC0 is nil") +assert(dst ~= nil, "SNABB_AVF_DST1 is nil") + +local basic = require("apps.basic.basic_apps") +local intel_avf = require("apps.intel_avf.intel_avf") +local synth = require("apps.test.synth") +local counter = require("core.counter") + +-- Test RSS queues +local nqueues = 4 +local c = config.new() +local sizes = {64,128,192,256,384,512,1024,1500} +local function random_v4_packets (conf) + local lib = require("core.lib") + local ethernet = require("lib.protocol.ethernet") + local ipv4 = require("lib.protocol.ipv4") + local eth = ethernet:new{src = ethernet:pton(conf.src), + dst = ethernet:pton(conf.dst), + type = 0x0800} + local packets = {} + for _, size in ipairs(conf.sizes) do + for _=1,10 do + local ip = ipv4:new{src=lib.random_bytes(4), + dst=lib.random_bytes(4)} + ip:total_length(size - eth:sizeof()) + local payload_length = ip:total_length() - ip:sizeof() + local p = packet.allocate() + packet.append(p, eth:header(), eth:sizeof()) + packet.append(p, ip:header(), ip:sizeof()) + packet.append(p, lib.random_bytes(payload_length), payload_length) + table.insert(packets, p) + end + end + return packets +end + +config.app(c, "synth0", synth.Synth, { + packets = random_v4_packets{ + sizes=sizes, + src=src, + dst=dst + } +}) +config.app(c, "synth1", synth.Synth, { + packets = random_v4_packets{ + sizes=sizes, + src=dst, + dst=src + } +}) +config.app(c, "nic0", intel_avf.Intel_avf, { pciaddr = vf0, nqueues = nqueues }) +config.app(c, "nic1", intel_avf.Intel_avf, { pciaddr = vf1, nqueues = nqueues }) +config.app(c, "sink", basic.Sink) +for qno=0, nqueues-1 do + config.app(c, "nic0_io"..qno, intel_avf.IO, {pciaddr = vf0, queue = qno}) + config.link(c, "synth0.output"..qno.. " -> nic0_io"..qno..".input") + config.link(c, "nic0_io"..qno..".output -> sink.input_nic0_io"..qno) + config.app(c, "nic1_io"..qno, intel_avf.IO, {pciaddr = vf1, queue = qno}) + config.link(c, "synth1.output"..qno.. " -> nic1_io"..qno..".input") + config.link(c, "nic1_io"..qno..".output -> sink.input_nic1_io"..qno) +end +engine.configure(c) +engine.main({ duration = 1, no_report = true }) +engine.report_links() +engine.report_apps() + +local received = {} +for nic=0,1 do + for qno=0, nqueues-1 do + local output = engine.app_table["nic"..nic.."_io"..qno].output.output + received[#received+1] = tonumber(counter.read(output.stats.txpackets)) + end +end + +local function mean (values) + local sum = 0 + for _, value in ipairs(values) do + sum = sum + value + end + return sum / #values +end + +local function stdev (values) + local avg = mean(values) + local var = {} + for _, value in ipairs(values) do + var[#var+1] = (value-avg)^2 + end + return math.sqrt(mean(var)) +end + +local rx_mean, rx_sd = mean(received), stdev(received) +print("RX AVG", rx_mean, "SD", rx_sd) +assert(rx_sd/rx_mean <= 0.1, "SD exceeds 10% of mean (queues should receive roughly equal numbers of packets)") + +-- Exercise VLANs + +config.app(c, "nic0", intel_avf.Intel_avf, { pciaddr = vf0, nqueues = nqueues, vlan = 42 }) +config.app(c, "nic1", intel_avf.Intel_avf, { pciaddr = vf1, nqueues = nqueues, vlan = 42 }) +engine.configure(c) +engine.main({ duration = 1, no_report = true }) +engine.report_links() +engine.report_apps() + +local received = {} +for nic=0,1 do + for qno=0, nqueues-1 do + local output = engine.app_table["nic"..nic.."_io"..qno].output.output + received[#received+1] = tonumber(counter.read(output.stats.txpackets)) + end +end + +local rx_mean, rx_sd = mean(received), stdev(received) +print("RX AVG", rx_mean, "SD", rx_sd) +assert(rx_sd/rx_mean <= 0.1, "SD exceeds 10% of mean (queues should receive roughly equal numbers of packets)") \ No newline at end of file diff --git a/src/apps/intel_avf/tests/back2back/test.snabb b/src/apps/intel_avf/tests/back2back/test.snabb index f4ec12c519..19014c239e 100755 --- a/src/apps/intel_avf/tests/back2back/test.snabb +++ b/src/apps/intel_avf/tests/back2back/test.snabb @@ -68,8 +68,8 @@ assert_eq(s, r, "packets_sr_1") n0:flush_stats() n1:flush_stats() -assert_eq(counter.read(n0.shm.txpackets), counter.read(n1.shm.rxpackets), "mxbox_sr_stats_1") -assert_eq(counter.read(n0.shm.txpackets), packet_count, "mbox_sr_stats_2") +assert_eq(counter.read(n0.stats.txpackets), counter.read(n1.stats.rxpackets), "mxbox_sr_stats_1") +assert_eq(counter.read(n0.stats.txpackets), packet_count, "mbox_sr_stats_2") local m = engine.app_table['match'] assert(#m:errors() == 0, "Corrupt packets.") @@ -96,7 +96,9 @@ while true do engine.main({ duration = 1, no_report = true }) end engine.report_links() +engine.report_apps() assert(rx("nic1.output", "sink.input") >= tosend, "packets received do not match packets sent") + engine.stop() main.exit(0) diff --git a/src/apps/intel_avf/tests/back2back/vlan.snabb b/src/apps/intel_avf/tests/back2back/vlan.snabb new file mode 100755 index 0000000000..b27b68d2c6 --- /dev/null +++ b/src/apps/intel_avf/tests/back2back/vlan.snabb @@ -0,0 +1,107 @@ +#!../../../../snabb snsh +local vf0 = os.getenv("SNABB_AVF_PF0_VF0") +local vf1 = os.getenv("SNABB_AVF_PF1_VF0") or os.getenv("SNABB_AVF_PF0_VF1") + +assert(vf0 ~= nil, "SNABB_AVF_PF0_VF0 is nil") +assert(vf1 ~= nil, "SNABB_AVF_PF1_VF0 is nil") + +local src = os.getenv("SNABB_AVF_PF0_SRC0") +local dst = os.getenv("SNABB_AVF_PF1_DST0") or os.getenv("SNABB_AVF_PF0_DST1") + +assert(src ~= nil, "SNABB_AVF_SRC0 is nil") +assert(dst ~= nil, "SNABB_AVF_DST1 is nil") + +local packet_count = 1001 + +local basic = require("apps.basic.basic_apps") +local intel_avf = require("apps.intel_avf.intel_avf") +local match = require("apps.test.match") +local npackets = require("apps.test.npackets") +local synth = require("apps.test.synth") +local counter = require("core.counter") + +local c = config.new() +config.app(c, "synth", synth.Synth, { + sizes = {64,67,128,133,192,256,384,512,777,1024}, + src=src, + dst=dst, + random_payload = true +} ) +config.app(c, "tee", basic.Tee) +config.app(c, "match", match.Match) + +config.app(c, "npackets", npackets.Npackets, { npackets = packet_count }) +config.app(c, "nic0", intel_avf.Intel_avf, { pciaddr = vf0, vlan = 1 }) +config.app(c, "nic1", intel_avf.Intel_avf, { pciaddr = vf1, vlan = 1 }) + +config.link(c, "synth.output -> npackets.input") +config.link(c, "npackets.output -> tee.input") +config.link(c, "tee.output1 -> nic0.input") +config.link(c, "nic1.output -> match.rx") +config.link(c, "tee.output2 -> match.comparator") + +engine.configure(c) + +local n0 = engine.app_table['nic0'] +local n1 = engine.app_table['nic1'] +n0:flush_stats() +n1:flush_stats() + +engine.main({duration = 1, report = false}) +engine.report_links() +engine.report_apps() + +function rx(l1, l2) + return counter.read(engine.link_table[l1 .. " -> " .. l2].stats.rxpackets) +end +function assert_eq(a,b,msg) + local an = tonumber(a) + local bn = tonumber(b) + assert(an == bn, msg .. " " .. an .. " ~= " .. bn) +end + +local s = rx("tee.output1", "nic0.input") +local r = rx("nic1.output", "match.rx") +assert_eq(s, r, "packets_sr_1") + +n0:flush_stats() +n1:flush_stats() +assert_eq(counter.read(n0.stats.txpackets), counter.read(n1.stats.rxpackets), "mxbox_sr_stats_1") +assert_eq(counter.read(n0.stats.txpackets), packet_count, "mbox_sr_stats_2") + +local m = engine.app_table['match'] +assert(#m:errors() == 0, "Corrupt packets.") + +-- Check VLAN filtering + +local c = config.new() +config.app(c, "synth", synth.Synth, { + sizes = {64,67,128,133,192,256,384,512,777,1024}, + src=src, + dst=dst, + random_payload = true +} ) + +config.app(c, "nic0", intel_avf.Intel_avf, { pciaddr = vf0, vlan = 1 }) +config.app(c, "nic1", intel_avf.Intel_avf, { pciaddr = vf1, vlan = 2 }) +config.app(c, "sink", basic.Sink) + +config.link(c, "synth.output -> nic0.input") +config.link(c, "nic1.output -> sink.input") + +engine.configure(c) + +local n0 = engine.app_table['nic0'] +local n1 = engine.app_table['nic1'] +n0:flush_stats() +n1:flush_stats() + +engine.main({duration = 1, report = false}) +engine.report_links() +engine.report_apps() + +n0:flush_stats() +n1:flush_stats() + +assert(counter.read(n0.stats.txpackets) > 0, "No packets sent") +assert(counter.read(n1.stats.rxpackets) == 0, "Should not receive from other VLAN") diff --git a/src/apps/ipv4/arp.lua b/src/apps/ipv4/arp.lua index 6c992db150..50383254b2 100644 --- a/src/apps/ipv4/arp.lua +++ b/src/apps/ipv4/arp.lua @@ -135,6 +135,14 @@ end ARP = {} ARP.shm = { ["next-hop-macaddr-v4"] = {counter}, + ["in-arp-request-bytes"] = {counter}, + ["in-arp-request-packets"] = {counter}, + ["out-arp-request-bytes"] = {counter}, + ["out-arp-request-packets"] = {counter}, + ["in-arp-reply-bytes"] = {counter}, + ["in-arp-reply-packets"] = {counter}, + ["out-arp-reply-bytes"] = {counter}, + ["out-arp-reply-packets"] = {counter}, } local arp_config_params = { -- Source MAC address will default to a random address. @@ -186,6 +194,8 @@ function ARP:maybe_send_arp_request (output) end function ARP:send_arp_request (output) + counter.add(self.shm["out-arp-request-bytes"], self.arp_request_pkt.length) + counter.add(self.shm["out-arp-request-packets"]) transmit(output, packet.clone(self.arp_request_pkt)) end @@ -239,11 +249,18 @@ function ARP:push() h.arp.hlen ~= 6 or h.arp.plen ~= 4) then -- Ignore invalid packet. elseif ntohs(h.arp.oper) == arp_oper_request then + counter.add(self.shm["in-arp-request-bytes"], p.length) + counter.add(self.shm["in-arp-request-packets"]) if self.self_ip and ipv4_eq(h.arp.tpa, self.self_ip) then - transmit(osouth, make_arp_reply(self.self_mac, self.self_ip, - h.arp.sha, h.arp.spa)) + local reply = make_arp_reply(self.self_mac, self.self_ip, + h.arp.sha, h.arp.spa) + counter.add(self.shm["out-arp-reply-bytes"], reply.length) + counter.add(self.shm["out-arp-reply-packets"]) + transmit(osouth, reply) end elseif ntohs(h.arp.oper) == arp_oper_reply then + counter.add(self.shm["in-arp-reply-bytes"], p.length) + counter.add(self.shm["in-arp-reply-packets"]) if self.next_ip and ipv4_eq(h.arp.spa, self.next_ip) then self:arp_resolved(self.next_ip, copy_mac(h.arp.sha), 'remote') end @@ -276,9 +293,14 @@ end function selftest() print('selftest: arp') - local arp = ARP:new({ self_ip = ipv4:pton('1.2.3.4'), - next_ip = ipv4:pton('5.6.7.8'), - shared_next_mac_key = "foo" }) + local c = config.new() + config.app(c, 'arp', ARP, { + self_ip = ipv4:pton('1.2.3.4'), + next_ip = ipv4:pton('5.6.7.8'), + shared_next_mac_key = "foo" + }) + engine.configure(c) + local arp = engine.app_table.arp arp.input = { south=link.new('south in'), north=link.new('north in') } arp.output = { south=link.new('south out'), north=link.new('north out') } diff --git a/src/apps/ipv4/echo.lua b/src/apps/ipv4/echo.lua index 9d422a81ea..e025c6ffec 100644 --- a/src/apps/ipv4/echo.lua +++ b/src/apps/ipv4/echo.lua @@ -73,7 +73,14 @@ local function ipv4_header_length(h) return bit.band(h.version_and_ihl, ipv4_ihl_mask) * 4 end -ICMPEcho = {} +ICMPEcho = { + shm = { + ['in-icmpv4-echo-bytes'] = {counter}, + ['in-icmpv4-echo-packets'] = {counter}, + ['out-icmpv4-echo-bytes'] = {counter}, + ['out-icmpv4-echo-packets'] = {counter}, + } +} function ICMPEcho:new(conf) local addresses = {} @@ -139,6 +146,12 @@ function ICMPEcho:respond_to_echo_request(pkt) ipsum(out.data + ether_header_len + ipv4_header_len, out.length - ether_header_len - ipv4_header_len, 0)) + -- Update counters + counter.add(self.shm['in-icmpv4-echo-bytes'], pkt.length) + counter.add(self.shm['in-icmpv4-echo-packets']) + counter.add(self.shm['out-icmpv4-echo-bytes'], out.length) + counter.add(self.shm['out-icmpv4-echo-packets']) + link.transmit(self.output.south, out) return true diff --git a/src/apps/ipv4/fragment.lua b/src/apps/ipv4/fragment.lua index 3aced06e1d..4e07271bac 100644 --- a/src/apps/ipv4/fragment.lua +++ b/src/apps/ipv4/fragment.lua @@ -64,7 +64,7 @@ end local function ipv4_packet_has_valid_length(h, len) if len < ffi.sizeof(ether_ipv4_header_t) then return false end if ipv4_header_length(h.ipv4) < 20 then return false end - return ntohs(h.ipv4.total_length) == len - ether_header_len + return ntohs(h.ipv4.total_length) <= len - ether_header_len end Fragmenter = {} diff --git a/src/apps/ipv4/reassemble.lua b/src/apps/ipv4/reassemble.lua index 87b20b05b9..fa01f7b3be 100644 --- a/src/apps/ipv4/reassemble.lua +++ b/src/apps/ipv4/reassemble.lua @@ -75,7 +75,7 @@ local function ipv4_packet_has_valid_length(h, len) if len < ffi.sizeof(ether_ipv4_header_t) then return false end local ihl = bit.band(h.ipv4.version_and_ihl, ipv4_ihl_mask) if ihl < 5 then return false end - return ntohs(h.ipv4.total_length) == len - ether_header_len + return ntohs(h.ipv4.total_length) <= len - ether_header_len end -- IPv4 requires recalculating an embedded checksum. diff --git a/src/apps/ipv6/echo.lua b/src/apps/ipv6/echo.lua index 4817b63f18..bfc6262114 100644 --- a/src/apps/ipv6/echo.lua +++ b/src/apps/ipv6/echo.lua @@ -66,7 +66,14 @@ local icmp_header_ptr_t = ffi.typeof('$*', icmp_header_t) local function ipv6_equals(a, b) return ffi.C.memcmp(a, b, 16) == 0 end -ICMPEcho = {} +ICMPEcho = { + shm = { + ['in-icmpv6-echo-bytes'] = {counter}, + ['in-icmpv6-echo-packets'] = {counter}, + ['out-icmpv6-echo-bytes'] = {counter}, + ['out-icmpv6-echo-packets'] = {counter}, + } +} function ICMPEcho:new(conf) local addresses = {} @@ -131,6 +138,12 @@ function ICMPEcho:respond_to_echo_request(pkt) ffi.sizeof(ipv6_pseudo_header_t), 0)))) + -- Update counters + counter.add(self.shm['in-icmpv6-echo-bytes'], pkt.length) + counter.add(self.shm['in-icmpv6-echo-packets']) + counter.add(self.shm['out-icmpv6-echo-bytes'], out.length) + counter.add(self.shm['out-icmpv6-echo-packets']) + link.transmit(self.output.south, out) return true diff --git a/src/apps/ipv6/fragment.lua b/src/apps/ipv6/fragment.lua index ec8cc14e95..eedca13cf2 100644 --- a/src/apps/ipv6/fragment.lua +++ b/src/apps/ipv6/fragment.lua @@ -80,7 +80,7 @@ local fragment_header_ptr_t = ffi.typeof('$*', fragment_header_t) -- Precondition: packet already has IPv6 ethertype. local function ipv6_packet_has_valid_length(h, len) if len < ether_ipv6_header_len then return false end - return ntohs(h.ipv6.payload_length) == len - ether_ipv6_header_len + return ntohs(h.ipv6.payload_length) <= len - ether_ipv6_header_len end Fragmenter = {} diff --git a/src/apps/ipv6/reassemble.lua b/src/apps/ipv6/reassemble.lua index a3467c18b8..e2c418fe7f 100644 --- a/src/apps/ipv6/reassemble.lua +++ b/src/apps/ipv6/reassemble.lua @@ -90,7 +90,7 @@ local function ipv6_packet_has_valid_length(h, len) -- The minimum Ethernet frame size is 60 bytes (without FCS). Those -- frames may contain padding bytes. local payload_length = ntohs(h.ipv6.payload_length) - return payload_length <= 60 or payload_length == len - ether_ipv6_header_len + return payload_length <= len - ether_ipv6_header_len end local function swap(array, i, j) diff --git a/src/apps/lwaftr/binding_table.lua b/src/apps/lwaftr/binding_table.lua index 977606f4b5..b2b0c77f6f 100644 --- a/src/apps/lwaftr/binding_table.lua +++ b/src/apps/lwaftr/binding_table.lua @@ -279,7 +279,7 @@ function selftest() local mem = require("lib.stream.mem") local yang = require('lib.yang.yang') local data = require('lib.yang.data') - local schema = yang.load_schema_by_name('snabb-softwire-v2') + local schema = yang.load_schema_by_name('snabb-softwire-v3') local grammar = data.config_grammar_from_schema(schema) local subgrammar = assert(grammar.members['softwire-config']) local subgrammar = assert(subgrammar.members['binding-table']) diff --git a/src/apps/lwaftr/lwaftr.lua b/src/apps/lwaftr/lwaftr.lua index 8fd945c129..e76b8faec3 100644 --- a/src/apps/lwaftr/lwaftr.lua +++ b/src/apps/lwaftr/lwaftr.lua @@ -339,22 +339,7 @@ local function drop(pkt) packet.free(pkt) end -local function select_instance(conf) - local function table_merge(t1, t2) - local ret = {} - for k,v in pairs(t1) do ret[k] = v end - for k,v in pairs(t2) do ret[k] = v end - return ret - end - local device, id, queue = lwutil.parse_instance(conf) - conf.softwire_config.external_interface = table_merge( - conf.softwire_config.external_interface, queue.external_interface) - conf.softwire_config.internal_interface = table_merge( - conf.softwire_config.internal_interface, queue.internal_interface) - return conf -end - -LwAftr = { yang_schema = 'snabb-softwire-v2' } +LwAftr = { yang_schema = 'snabb-softwire-v3' } -- Fields: -- - direction: "in", "out", "hairpin", "drop"; -- If "direction" is "drop": @@ -407,10 +392,10 @@ LwAftr.shm = { ["in-ipv4-packets"] = {counter}, ["in-ipv6-bytes"] = {counter}, ["in-ipv6-packets"] = {counter}, - ["out-icmpv4-bytes"] = {counter}, - ["out-icmpv4-packets"] = {counter}, - ["out-icmpv6-bytes"] = {counter}, - ["out-icmpv6-packets"] = {counter}, + ["out-icmpv4-error-bytes"] = {counter}, + ["out-icmpv4-error-packets"] = {counter}, + ["out-icmpv6-error-bytes"] = {counter}, + ["out-icmpv6-error-packets"] = {counter}, ["out-ipv4-bytes"] = {counter}, ["out-ipv4-packets"] = {counter}, ["out-ipv6-bytes"] = {counter}, @@ -420,9 +405,8 @@ LwAftr.shm = { function LwAftr:new(conf) if conf.debug then debug = true end local o = setmetatable({}, {__index=LwAftr}) - conf = select_instance(conf).softwire_config + conf = lwutil.merge_instance(conf).softwire_config o.conf = conf - o.binding_table = bt.load(conf.binding_table) o.inet_lookup_queue = bt.BTLookupQueue.new(o.binding_table) o.hairpin_lookup_queue = bt.BTLookupQueue.new(o.binding_table) @@ -465,7 +449,7 @@ end -- The following two methods are called by lib.ptree.worker in reaction -- to binding table changes, via --- lib/ptree/support/snabb-softwire-v2.lua. +-- lib/ptree/support/snabb-softwire-v3.lua. function LwAftr:add_softwire_entry(entry_blob) self.binding_table:add_softwire_entry(entry_blob) end @@ -508,8 +492,8 @@ function LwAftr:transmit_icmpv6_reply (pkt) -- Send packet if limit not reached. if self.icmpv6_error_count < rate_limiting.packets then self.icmpv6_error_count = self.icmpv6_error_count + 1 - counter.add(self.shm["out-icmpv6-bytes"], pkt.length) - counter.add(self.shm["out-icmpv6-packets"]) + counter.add(self.shm["out-icmpv6-error-bytes"], pkt.length) + counter.add(self.shm["out-icmpv6-error-packets"]) counter.add(self.shm["out-ipv6-bytes"], pkt.length) counter.add(self.shm["out-ipv6-packets"]) return transmit(self.o6, pkt) @@ -552,8 +536,8 @@ function LwAftr:transmit_icmpv4_reply(pkt, orig_pkt, orig_pkt_link) -- Send packet if limit not reached. if self.icmpv4_error_count < rate_limiting.packets then self.icmpv4_error_count = self.icmpv4_error_count + 1 - counter.add(self.shm["out-icmpv4-bytes"], pkt.length) - counter.add(self.shm["out-icmpv4-packets"]) + counter.add(self.shm["out-icmpv4-error-bytes"], pkt.length) + counter.add(self.shm["out-icmpv4-error-packets"]) -- Only locally generated error packets are handled here. We transmit -- them right away, instead of calling transmit_ipv4, because they are -- never hairpinned and should not be counted by the "out-ipv4" counter. diff --git a/src/apps/lwaftr/lwutil.lua b/src/apps/lwaftr/lwutil.lua index a4cd412190..5556f2f9f8 100644 --- a/src/apps/lwaftr/lwutil.lua +++ b/src/apps/lwaftr/lwutil.lua @@ -7,6 +7,7 @@ local bit = require("bit") local ffi = require("ffi") local lib = require("core.lib") local cltable = require("lib.cltable") +local binary = require("lib.yang.binary") local band = bit.band local cast = ffi.cast @@ -21,24 +22,82 @@ local ntohs = lib.ntohs -- Return device PCI address, queue ID, and queue configuration. function parse_instance(conf) - local device, instance - for k, v in pairs(conf.softwire_config.instance) do - assert(device == nil, "configuration has more than one instance") - device, instance = k, v + if conf.worker_config then + local device = conf.worker_config.device + local id = conf.worker_config.queue_id + local queue = conf.softwire_config.instance[device].queue[id] + return device, id, queue + else + local device, id + for dev in pairs(conf.softwire_config.instance) do + assert(not device, "Config contains more than one device") + device = dev + end + for queue in pairs(conf.softwire_config.instance[device].queue) do + assert(not id, "Config contains more than one queue") + id = queue + end + return device, id, conf.softwire_config.instance[device].queue[id] end - assert(device ~= nil, "configuration has no instance") - local id, queue - for k, v in pairs(instance.queue) do - assert(id == nil, "configuration has more than one RSS queue") - id, queue = k, v +end + +function is_on_a_stick(conf, device) + local instance = conf.softwire_config.instance[device] + if not instance.external_device then return true end + return device == instance.external_device +end + +function is_lowest_queue(conf) + local device, id = parse_instance(conf) + for n in pairs(conf.softwire_config.instance[device].queue) do + if id > n then return false end end - assert(id ~= nil, "configuration has no RSS queues") - return device, id, queue + return true end -function is_on_a_stick(device, queue) - if not queue.external_interface.device and device then return true end - return device == queue.external_interface.device +function num_queues(conf) + local n = 0 + local device, id = parse_instance(conf) + for _ in pairs(conf.softwire_config.instance[device].queue) do + n = n + 1 + end + return n +end + +function select_instance(conf) + local copier = binary.config_copier_for_schema_by_name('snabb-softwire-v3') + local device, id = parse_instance(conf) + local copy = copier(conf)() + local instance = copy.softwire_config.instance + for other_device, queues in pairs(conf.softwire_config.instance) do + if other_device ~= device then + instance[other_device] = nil + else + for other_id, _ in pairs(queues.queue) do + if other_id ~= id then + instance[device].queue[other_id] = nil + end + end + end + end + return copy +end + +function merge_instance (conf) + local function table_merge(t1, t2) + local ret = {} + for k,v in pairs(t1) do ret[k] = v end + for k,v in pairs(t2) do ret[k] = v end + return ret + end + local copier = binary.config_copier_for_schema_by_name('snabb-softwire-v3') + local copy = copier(conf)() + local _, _, queue = parse_instance(conf) + copy.softwire_config.external_interface = table_merge( + conf.softwire_config.external_interface, queue.external_interface) + copy.softwire_config.internal_interface = table_merge( + conf.softwire_config.internal_interface, queue.internal_interface) + return copy end function get_ihl_from_offset(pkt, offset) diff --git a/src/apps/lwaftr/ndp.lua b/src/apps/lwaftr/ndp.lua index 7e8b48f502..94617738a6 100644 --- a/src/apps/lwaftr/ndp.lua +++ b/src/apps/lwaftr/ndp.lua @@ -220,18 +220,17 @@ local function make_na_packet(src_mac, dst_mac, src_ip, dst_ip, is_router) end -- Solicit a neighbor's address. -local function make_ns_packet(src_mac, src_ip, dst_ip) +local function make_ns_packet(src_mac, src_ip, dst_mac, dst_ip, target_ip) local message = ns_header_t() message.flags = 0 - message.target_ip = dst_ip + message.target_ip = target_ip local option = ether_option_header_t() option.header.type = option_source_link_layer_address option.header.length = 1 -- One 8-byte unit. option.addr = src_mac - local broadcast_mac = ethernet:pton("ff:ff:ff:ff:ff:ff") - return make_ndp_packet(src_mac, broadcast_mac, src_ip, dst_ip, icmpv6_ns, + return make_ndp_packet(src_mac, dst_mac, src_ip, dst_ip, icmpv6_ns, message, option) end @@ -268,6 +267,14 @@ end NDP = {} NDP.shm = { ["next-hop-macaddr-v6"] = {counter}, + ["in-ndp-ns-bytes"] = {counter}, + ["in-ndp-ns-packets"] = {counter}, + ["out-ndp-ns-bytes"] = {counter}, + ["out-ndp-ns-packets"] = {counter}, + ["in-ndp-na-bytes"] = {counter}, + ["in-ndp-na-packets"] = {counter}, + ["out-ndp-na-bytes"] = {counter}, + ["out-ndp-na-packets"] = {counter}, } local ndp_config_params = { -- Source MAC address will default to a random address. @@ -298,6 +305,21 @@ function NDP:new(conf) assert(o.next_ip, 'NDP needs next-hop IPv6 address to learn next-hop MAC') self.ns_interval = 3 -- Send a new NS every three seconds. end + if o.next_ip then + -- Construct Solicited-Node multicast address + -- https://datatracker.ietf.org/doc/html/rfc4861#section-2.3 + o.solicited_node_mcast = ipv6:pton("ff02::1:ff00:0") -- /104 + o.solicited_node_mcast[13] = o.next_ip[13] + o.solicited_node_mcast[14] = o.next_ip[14] + o.solicited_node_mcast[15] = o.next_ip[15] + -- Construct Ethernet multicast address + -- https://datatracker.ietf.org/doc/html/rfc2464#section-7 + o.mac_mcast = ethernet:pton("33:33:00:00:00:00") + o.mac_mcast[2] = o.solicited_node_mcast[12] + o.mac_mcast[3] = o.solicited_node_mcast[13] + o.mac_mcast[4] = o.solicited_node_mcast[14] + o.mac_mcast[5] = o.solicited_node_mcast[15] + end return setmetatable(o, {__index=NDP}) end @@ -313,8 +335,12 @@ function NDP:maybe_send_ns_request (output) self.next_ns_time = self.next_ns_time or engine.now() if self.next_ns_time <= engine.now() then self:ndp_resolving(self.next_ip) - transmit(self.output.south, - make_ns_packet(self.self_mac, self.self_ip, self.next_ip)) + local ns = make_ns_packet(self.self_mac, self.self_ip, + self.mac_mcast, self.solicited_node_mcast, + self.next_ip) + counter.add(self.shm["out-ndp-ns-bytes"], ns.length) + counter.add(self.shm["out-ndp-ns-packets"]) + transmit(self.output.south, ns) self.next_ns_time = engine.now() + self.ns_interval end end @@ -374,6 +400,8 @@ function NDP:handle_ndp (pkt) if not verify_icmp_checksum(pkt) then return end if h.icmpv6.type == icmpv6_na then + counter.add(self.shm["in-ndp-na-bytes"], pkt.length) + counter.add(self.shm["in-ndp-na-packets"]) -- Only process advertisements when we are looking for a -- next-hop MAC. if self.next_mac then return end @@ -406,6 +434,8 @@ function NDP:handle_ndp (pkt) -- Advertisement Message Format. self:resolve_next_hop(copy_mac(h.ether.shost)) elseif h.icmpv6.type == icmpv6_ns then + counter.add(self.shm["in-ndp-ns-bytes"], pkt.length) + counter.add(self.shm["in-ndp-ns-packets"]) if pkt.length < ndp_header_len + ffi.sizeof(ns_header_t) then return end local ns = ffi.cast(ns_header_ptr_t, h.body) if is_address_multicast(ns.target_ip) then return end @@ -433,9 +463,11 @@ function NDP:handle_ndp (pkt) end end end - link.transmit(self.output.south, - make_na_packet(self.self_mac, h.ether.shost, - self.self_ip, dst_ip, self.is_router)) + local na = make_na_packet(self.self_mac, h.ether.shost, + self.self_ip, dst_ip, self.is_router) + counter.add(self.shm["out-ndp-na-bytes"], na.length) + counter.add(self.shm["out-ndp-na-packets"]) + link.transmit(self.output.south, na) else -- Unhandled NDP packet; silently drop. return @@ -500,7 +532,9 @@ function selftest() config.link(c, "sink2.tx -> nd2.north") config.link(c, "nd2.north -> sink2.rx") engine.configure(c) - engine.main({ duration = 0.1 }) + local breaths = counter.read(engine.breaths) + local function done() return counter.read(engine.breaths)-breaths > 1 end + engine.main({ done = done }) local function mac_eq(a, b) return ffi.C.memcmp(a, b, 6) == 0 end local nd1, nd2 = engine.app_table.nd1, engine.app_table.nd2 diff --git a/src/apps/mellanox/README.md b/src/apps/mellanox/README.md new file mode 100644 index 0000000000..f348c9759c --- /dev/null +++ b/src/apps/mellanox/README.md @@ -0,0 +1,79 @@ +# Mellanox Connect-X app (apps.mellanox.connectx) + +The `connectx.ConnectX` app provides a driver for +Mellanox Connect-X 4, 5, and 6 series network cards. + +The links are named `input` and `output`. + + DIAGRAM: ConnectX + +-----------+ + | | + input ---->* ConnectX *----> output + | | + +-----------+ + +## Configuration + +— Key **pciaddress** + +*Required*. The PCI address of the NIC as a string. + +— Key **queues** + +*Required*. Array of RX/TX queue specifications. +You need to use the `connectx.IO` app to attach for I/O on each respective queue. +A queue specification is a table with the following keys: + + * `id`—a unique queue identifier string + * `vlan`—an optional VLAN identifier + * `mac`—an optional MAC address as a string + (either none or all queues must specify a MAC) + +Multiple queues with matching `vlan`/`mac` identifiers will have incoming traffic +distributed between them via 3-tuple or 5-tuple RSS. +Multicast and broadcast traffic arrives on the first queue of each RSS group. + +— Key **mtu** + +*Optional.* MTU configured for the device. The default is 9500. + +— Key **sendq_size** + +— Key **recvq_size** + +*Optional*. Sizes of the send and receive queues. The default is 1024. + + +## IO app + +The `connectx.IO` app provides a driver for a single queue of a +Mellanox Connect-X network card (see *queues*). + +The links are names `input` and `output`. + + DIAGRAM: connectx_IO + +-----------+ + | | + input ---->* IO *----> output + | | + +-----------+ +### Configuration + +— Key **pciaddress** + +*Required*. The PCI address of the NIC as a string. + +— Key **queue** + +*Required*. The queue identifier of the respective queue. + +## Supported Hardware + +This driver has been confirmed to work with +Mellanox Connect-X 4, 5, and 6 series cards. + +## Unsupported features + +* VLAN promiscuous mode is not supported + (i.e., queues that specify `vlan` but no `mac`) +* Local-loopback between queues is not implemented \ No newline at end of file diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua new file mode 100644 index 0000000000..9a9b2e1184 --- /dev/null +++ b/src/apps/mellanox/connectx.lua @@ -0,0 +1,2444 @@ +-- Device driver for the Mellanox ConnectX-4+ Ethernet controller family. +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. + +-- This is a device driver for Mellanox ConnectX family ethernet +-- cards. This driver is completely stand-alone and does not depend on +-- any other software such as Mellanox OFED library or the Linux mlx5 +-- driver. +-- +-- Thanks are due to Mellanox and Deutsche Telekom for making it +-- possible to develop this driver based on publicly available +-- information. Mellanox supported this work by releasing an edition +-- of their Programming Reference Manual (PRM) that is not subject to +-- confidentiality restrictions. This is now a valuable resource to +-- independent open source developers everywhere (spread the word!) +-- +-- Special thanks to Normen Kowalewski and Rainer Schatzmayer. + +-- General notes about this implementation: +-- +-- The driver is based primarily on the PRM: +-- http://www.mellanox.com/related-docs/user_manuals/Ethernet_Adapters_Programming_Manual.pdf +-- +-- The Linux mlx5_core driver is also used for reference. This +-- driver implements the same hexdump format as mlx5_core so it is +-- possible to directly compare/diff the binary encoded commands +-- that the drivers send. +-- +-- Physical addresses are always used for DMA (rlkey). + +module(...,package.seeall) + +local ffi = require "ffi" +local C = ffi.C +local lib = require("core.lib") +local sync = require("core.sync") +local pci = require("lib.hardware.pci") +local register = require("lib.hardware.register") +local index_set = require("lib.index_set") +local macaddress = require("lib.macaddress") +local mib = require("lib.ipc.shmem.mib") +local timer = require("core.timer") +local shm = require("core.shm") +local counter = require("core.counter") +local bits, bitset = lib.bits, lib.bitset +local floor = math.floor +local cast = ffi.cast +local ethernet = require("lib.protocol.ethernet") + +local band, bor, shl, shr, bswap, bnot = + bit.band, bit.bor, bit.lshift, bit.rshift, bit.bswap, bit.bnot +local cast, typeof = ffi.cast, ffi.typeof + +local debug_trace = false -- Print trace messages +local debug_hexdump = false -- Print hexdumps (in Linux mlx5 format) + +-- Maximum size of a receive queue table. +-- XXX This is hard-coded in the Linux mlx5 driver too. Could +-- alternatively detect from query_hca_cap. +local rqt_max_size = 128 + +--------------------------------------------------------------- +-- CXQ (ConnectX Queue pair) control object: +-- +-- A "CXQ" is an object that we define to represent a transmit/receive pair. +-- +-- CXQs are created and deleted by a "Control" app and, in between, +-- they are used by "IO" apps to send and receive packets. +-- +-- The lifecycle of a CXQ is managed using a state machine. This is +-- necessary because we allow Control and IO apps to start in any +-- order, for Control and IO apps to start/stop/restart independently, +-- for multiple IO apps to attempt to attach to the same CXQ, and even +-- for apps to stop in one Snabb process and be started in another +-- one. +-- +-- (This design may turn out to be overkill if we discover in the +-- future that we do not need this much flexibility. Time will tell.) +--------------------------------------------------------------- + +-- CXQs can be in one of five states: +-- INIT: CXQ is being initialized by the control app +-- FREE: CXQ is ready and available for use by an IO app. +-- IDLE: CXQ is owned by an app, but not actively processing right now. +-- BUSY: CXQ is owned by an app and is currently processing (e.g. push/pull). +-- DEAD: CXQ has been deallocated; IO app must try to open a new one. +-- +-- Once a CXQ is closed it stays in the DEAD state forever. However, a +-- replacement CXQ with the same name can be created and existing IO +-- apps can reattach to that instead. This will rerun the state machine. +-- +-- Here are the valid state transitions & when they occur: +-- +-- App Change Why +-- ---- ----------- -------------------------------------------------------- +-- CTRL none->INIT: Control app starts initialization. +-- CTRL INIT->FREE: Control app completes initialization. +-- IO FREE->IDLE: IO app starts and becomes owner of the CXQ. +-- IO IDLE->FREE: IO app stops and releases the CXQ for future use. +-- IO IDLE->BUSY: IO app starts running a pull/push method. +-- IO BUSY->IDLE: IO app stops running a pull/push method. +-- CTRL IDLE->DEAD: Control app closes the CXQ. (Replacement can be created.) +-- CTRL FREE->DEAD: Control app closes the CXQ. (Replacement can be created.) +-- +-- These state transitions are *PROHIBITED* for important reasons: +-- +-- App Change Why *PROHIBITED* +-- ------ ----------- -------------------------------------------------------- +-- CTRL BUSY->DEAD Cannot close a CXQ while it is busy (must wait.) +-- IO DEAD->BUSY Cannot use a CXQ that is closed (must check.) +-- * DEAD->* Cannot transition from DEAD (must create new CXQ.) +-- +-- Further notes: +-- +-- Packet buffers for pending DMA (transmit or receive) are freed by +-- the Control app (which can disable DMA first) rather than by the IO +-- app (which shuts down with DMA still active.) + +-- A CXQ is represented by one struct allocated in shared memory. +-- +-- The struct defines the fields in very specific terms so that it can +-- be used directly by the driver code (rather than copying back and +-- forth between the shared memory object and a separate native +-- format.) +local cxq_t = ffi.typeof([[ + struct { + int state[1]; // current state / availability + + // configuration information: + uint32_t sqn; // send queue number + uint32_t sqsize; // send queue size + uint32_t uar; // user access region + uint32_t rlkey; // rlkey for value + uint32_t rqn; // receive queue number + uint32_t rqsize; // receive queue size + + // DMA structures: + // doorbell contains send/receive ring cursor positions + struct { uint32_t receive, send; } *doorbell; + + // receive work queue + struct { uint32_t length, lkey, dma_hi, dma_lo; } *rwq; + + // send work queue and send/receive completion queues + union { uint8_t u8[64]; uint32_t u32[0]; uint64_t u64[0];} *swq, *scq, *rcq; + + // The tx and rx lists must each be large enough for the maximum + // queue size, which currently is 32768. We should probably add + // a check for that. + + // Transmit state + struct packet *tx[64*1024]; // packets queued for transmit + uint16_t next_tx_wqeid; // work queue ID for next transmit descriptor + uint64_t *bf_next, *bf_alt; // "blue flame" to ring doorbell (alternating) + + // Receive state + struct packet *rx[64*1024]; // packets queued for receive + uint16_t next_rx_wqeid; // work queue ID for next receive descriptor + uint16_t next_rx_cqeid; // completion queue ID of next completed packet + int rx_mine; // CQE ownership value that means software-owned + } +]]) + +-- CXQ states: +local INIT = 0 -- Implicit initial state due to 0 value. +local BUSY = 1 +local IDLE = 2 +local FREE = 3 +local DEAD = 4 + +-- Release CXQ after process termination. Called from +-- core.main.shutdown +function shutdown(pid) + for _, pciaddr in ipairs(shm.children("/"..pid.."/mellanox")) do + for _, queue in ipairs(shm.children("/"..pid.."/mellanox/"..pciaddr)) do + -- NB: this iterates the backlinks created by IO apps! + -- Meaning, this cleans up CXQ attachments from dying IO apps. + -- The actual CXQ objects are cleaned up in the process running + -- the Control app (see ConnectX:stop()). + -- The code below is just to make sure crashing IO apps do not block + -- the Control app. + local backlink = "/"..pid.."/mellanox/"..pciaddr.."/"..queue + local shm_name = "/"..pid.."/group/pci/"..pciaddr.."/"..queue + if shm.exists(shm_name) then + local cxq = shm.open(shm_name, cxq_t) + assert(sync.cas(cxq.state, IDLE, FREE) or + sync.cas(cxq.state, BUSY, FREE), + "ConnectX: failed to free "..shm_name.. + " during shutdown") + end + shm.unlink(backlink) + end + end +end + +--------------------------------------------------------------- +-- ConnectX Snabb app. +-- +-- Uses the driver routines to implement ConnectX-4 support in +-- the Snabb app network. +--------------------------------------------------------------- + +ConnectX = {} +ConnectX.__index = ConnectX + +local mlx_types = { + ["0x1013" ] = 4, -- ConnectX4 + ["0x1017" ] = 5, -- ConnectX5 + ["0x1019" ] = 5, -- ConnectX5 + ["0x101d" ] = 6, -- ConnectX6 +} + +function ConnectX:new (conf) + local self = setmetatable({}, self) + local pciaddress = pci.qualified(conf.pciaddress) + local device_info = pci.device_info(pciaddress) + self.mlx = assert(mlx_types[device_info.device], + "Unsupported device "..device_info.device) + + local sendq_size = conf.sendq_size or 1024 + local recvq_size = conf.recvq_size or 1024 + + local mtu = conf.mtu or 9500 + + -- Perform a hard reset of the device to bring it into a blank state. + -- + -- Reset is performed at PCI level instead of via firmware command. + -- This is intended to be robust to problems like bad firmware states. + pci.unbind_device_from_linux(pciaddress) + pci.reset_device(pciaddress) + pci.set_bus_master(pciaddress, true) + + -- Setup the command channel + -- + local fd = pci.open_pci_resource_locked(pciaddress, 0) + local mmio = pci.map_pci_memory(fd) + local init_seg = InitializationSegment:new(mmio) + local hca_factory = HCA_factory(init_seg) + local hca = hca_factory:new() + + -- Makes enable_hca() hang with ConnectX5 + if self.mlx == 4 then + init_seg:reset() + end + if debug_trace then init_seg:dump() end + while not init_seg:ready() do + C.usleep(1000) + end + + -- Boot the card + -- + hca:enable_hca() + hca:set_issi(1) + hca:alloc_pages(hca:query_pages("boot")) + local max_cap = hca:query_hca_general_cap('max') + if debug_trace then self:dump_capabilities(hca) end + + -- Initialize the card + -- + hca:alloc_pages(hca:query_pages("init")) + hca:init_hca() + hca:alloc_pages(hca:query_pages("regular")) + + if debug_trace then self:check_vport() end + + hca:set_port_mtu(mtu) + hca:modify_nic_vport_context(mtu, true, true, true) + + -- Create basic objects that we need + -- + local uar = hca:alloc_uar() + local eq = hca:create_eq(uar) + local pd = hca:alloc_protection_domain() + local tdomain = hca:alloc_transport_domain() + local rlkey = hca:query_rlkey() + + -- CXQ objects managed by this control app + local cxq_shm = {} + + -- List of all receive queues for hashing traffic across + local rqlist = {} + local rqs = {} + + -- List of queue counter IDs (ConnectX5 and up) + local counter_set_ids = {} + + -- Enable MAC/VLAN switching? + local usemac = false + local usevlan = false + + -- Lists of receive queues by macvlan (used if usemac=true) + local macvlan_rqlist = {} + + for _, queue in ipairs(conf.queues) do + -- Create a shared memory object for controlling the queue pair + local shmpath = "group/pci/"..pciaddress.."/"..queue.id + local cxq = shm.create(shmpath, cxq_t) + cxq_shm[shmpath] = cxq + + local function check_qsize (type, size) + assert(check_pow2(size), + string.format("%s: %s queue size must be a power of 2: %d", + conf.pciaddress, type, size)) + assert(log2size(size) <= max_cap['log_max_wq_sz'], + string.format("%s: %s queue size too big: requested %d, allowed %d", + conf.pciaddress, type, size, + math.pow(2, max_cap['log_max_wq_sz']))) + end + + check_qsize("Send", sendq_size) + check_qsize("Receive", recvq_size) + + cxq.rlkey = rlkey + cxq.sqsize = sendq_size + cxq.rqsize = recvq_size + cxq.uar = uar + local scqn, scqe = hca:create_cq(1, uar, eq.eqn, true) + local rcqn, rcqe = hca:create_cq(recvq_size, uar, eq.eqn, false) + cxq.scq = cast(typeof(cxq.scq), scqe) + cxq.rcq = cast(typeof(cxq.rcq), rcqe) + cxq.doorbell = cast(typeof(cxq.doorbell), memory.dma_alloc(16)) + + local rq_stride = ffi.sizeof(ffi.typeof(cxq.rwq[0])) + local sq_stride = ffi.sizeof(ffi.typeof(cxq.swq[0])) + local workqueues = memory.dma_alloc(sq_stride * sendq_size + + rq_stride *recvq_size, 4096) + cxq.rwq = cast(ffi.typeof(cxq.rwq), workqueues) + cxq.swq = cast(ffi.typeof(cxq.swq), workqueues + rq_stride * recvq_size) + -- Create the queue objects + local tis = hca:create_tis(0, tdomain) + local counter_set_id + if self.mlx > 4 then + counter_set_id = hca:alloc_q_counter() + table.insert(counter_set_ids, counter_set_id) + end + -- XXX order check + cxq.sqn = hca:create_sq(scqn, pd, sq_stride, sendq_size, + cxq.doorbell, cxq.swq, uar, tis) + cxq.rqn = hca:create_rq(rcqn, pd, rq_stride, recvq_size, + cxq.doorbell, cxq.rwq, + counter_set_id) + hca:modify_sq(cxq.sqn, 0, 1) -- RESET -> READY + hca:modify_rq(cxq.rqn, 0, 1) -- RESET -> READY + + -- CXQ is now fully initialized & ready for attach. + assert(sync.cas(cxq.state, INIT, FREE)) + + usemac = usemac or (queue.mac ~= nil) + usevlan = usevlan or (queue.vlan ~= nil) + + -- XXX collect for flow table construction + rqs[queue.id] = cxq.rqn + rqlist[#rqlist+1] = cxq.rqn + end + + if usemac then + -- Collect macvlan_rqlist for flow table construction + for _, queue in ipairs(conf.queues) do + assert(queue.mac, "Queue does not specifiy MAC: "..queue.id) + if usevlan then + assert(queue.vlan, "Queue does not specify a VLAN: "..queue.id) + end + local vlan = queue.vlan or false + local mac = queue.mac + if not macvlan_rqlist[vlan] then + macvlan_rqlist[vlan] = {} + end + if not macvlan_rqlist[vlan][mac] then + macvlan_rqlist[vlan][mac] = {} + end + table.insert(macvlan_rqlist[vlan][mac], rqs[queue.id]) + end + elseif usevlan then + error("NYI: promisc vlan") + end + + local function setup_rss_rxtable (rqlist, tdomain, level) + -- Set up RSS accross all queues. Hashing is only performed for + -- IPv4/IPv6 with or without TCP/UDP. All non-IP packets are + -- mapped to Queue #1. Hashing is done by the TIR for a + -- specific combination of header values, hence separate flows + -- are needed to provide each TIR with the appropriate types of + -- packets. + local l3_protos = { 'v4', 'v6' } + local l4_protos = { 'udp', 'tcp' } + local rxtable = hca:create_flow_table( + -- #rules = #l3*l4 rules + #l3 rules + 1 wildcard rule + NIC_RX, level, #l3_protos * #l4_protos + #l3_protos + 1 + ) + local rqt = hca:create_rqt(rqlist) + local index = 0 + -- Match TCP/UDP packets + local flow_group_ip = hca:create_flow_group_ip( + rxtable, NIC_RX, index, index + #l3_protos * #l4_protos - 1 + ) + for _, l3_proto in ipairs(l3_protos) do + for _, l4_proto in ipairs(l4_protos) do + local tir = hca:create_tir_indirect(rqt, tdomain, + l3_proto, l4_proto) + -- NOTE: flow table entries will only match if the packet + -- contains the complete L4 header. Keep this in mind when + -- processing truncated packets (e.g. from a port-mirror). + -- If the header is incomplete, the packet will fall through + -- to the wildcard match and end up in the first queue. + hca:set_flow_table_entry_ip(rxtable, NIC_RX, flow_group_ip, + index, TIR, tir, l3_proto, l4_proto) + index = index + 1 + end + end + -- Fall-through for non-TCP/UDP IP packets + local flow_group_ip_l3 = hca:create_flow_group_ip( + rxtable, NIC_RX, index, index + #l3_protos - 1, "l3-only" + ) + for _, l3_proto in ipairs(l3_protos) do + local tir = hca:create_tir_indirect(rqt, tdomain, l3_proto, nil) + hca:set_flow_table_entry_ip(rxtable, NIC_RX, flow_group_ip_l3, + index, TIR, tir, l3_proto, nil) + index = index + 1 + end + -- Fall-through for non-IP packets + local flow_group_wildcard = + hca:create_flow_group_wildcard(rxtable, NIC_RX, index, index) + local tir_q1 = hca:create_tir_direct(rqlist[1], tdomain) + hca:set_flow_table_entry_wildcard(rxtable, NIC_RX, + flow_group_wildcard, index, TIR, tir_q1) + return rxtable + end + + local function setup_macvlan_rxtable (macvlan_rqlist, usevlan, tdomain, level) + -- Set up MAC+VLAN switching. + -- + -- For Unicast switch [MAC+VLAN->RSS->TIR]. I.e., forward packets + -- destined for a MAC+VLAN tuple to a RSS table containing all queues + -- belonging to that tuple. + -- (See notes on RSS in setup_rss_rxtable above.) + -- + -- For Multicast switch [VLAN->TIR+]. I.e., forward multicast packets + -- destined for a VLAN to the first queue of every MAC in that VLAN. + -- + local macvlan_size, mcast_size = 0, 0 + for vlan in pairs(macvlan_rqlist) do + mcast_size = mcast_size + 1 + for mac in pairs(macvlan_rqlist[vlan]) do + macvlan_size = macvlan_size + 1 + end + end + local rxtable = hca:create_flow_table( + NIC_RX, level, macvlan_size + mcast_size + ) + local index = 0 + -- Unicast flow table entries + local flow_group_macvlan = hca:create_flow_group_macvlan( + rxtable, NIC_RX, index, index + macvlan_size - 1, usevlan + ) + for vlan in pairs(macvlan_rqlist) do + for mac, rqlist in pairs(macvlan_rqlist[vlan]) do + local tid = setup_rss_rxtable(rqlist, tdomain, 1) + hca:set_flow_table_entry_macvlan(rxtable, NIC_RX, flow_group_macvlan, index, + FLOW_TABLE, tid, macaddress:new(mac), vlan) + index = index + 1 + end + end + -- Multicast flow table entries + local flow_group_mcast = hca:create_flow_group_macvlan( + rxtable, NIC_RX, index, index + mcast_size - 1, usevlan, 'mcast' + ) + local mac_mcast = macaddress:new("01:00:00:00:00:00") + for vlan in pairs(macvlan_rqlist) do + local mcast_tirs = {} + for mac, rqlist in pairs(macvlan_rqlist[vlan]) do + mcast_tirs[#mcast_tirs+1] = hca:create_tir_direct(rqlist[1], tdomain) + end + hca:set_flow_table_entry_macvlan(rxtable, NIC_RX, flow_group_mcast, index, + TIR, mcast_tirs, mac_mcast, vlan, 'mcast') + index = index + 1 + end + return rxtable + end + + if usemac then + local rxtable = setup_macvlan_rxtable(macvlan_rqlist, usevlan, tdomain, 0) + hca:set_flow_table_root(rxtable, NIC_RX) + else + local rxtable = setup_rss_rxtable(rqlist, tdomain, 0) + hca:set_flow_table_root(rxtable, NIC_RX) + end + + self.shm = { + mtu = {counter, mtu}, + txdrop = {counter} + } + + local vport_context = hca:query_nic_vport_context() + local frame = { + dtime = {counter, C.get_unix_time()}, + -- Keep a copy of the mtu here to have all + -- data available in a single shm frame + mtu = {counter, mtu}, + speed = {counter}, + status = {counter, 2}, -- Link down + type = {counter, 0x1000}, -- ethernetCsmacd + promisc = {counter, vport_context.promisc_all}, + macaddr = {counter, vport_context.permanent_address.bits}, + rxbytes = {counter}, + rxpackets = {counter}, + rxmcast = {counter}, + rxbcast = {counter}, + rxdrop = {counter}, + rxerrors = {counter}, + txbytes = {counter}, + txpackets = {counter}, + txmcast = {counter}, + txbcast = {counter}, + txdrop = {counter}, + txerrors = {counter}, + } + self.stats = shm.create_frame("pci/"..pciaddress, frame) + + -- Create separate HCAs to retreive port statistics. Those + -- commands must be called asynchronously to reduce latency. + self.stats_reqs = { + { + start_fn = HCA.get_port_stats_start, + finish_fn = HCA.get_port_stats_finish, + process_fn = function (r, stats) + local set = counter.set + set(stats.rxbytes, r.rxbytes) + set(stats.rxpackets, r.rxpackets) + set(stats.rxmcast, r.rxmcast) + set(stats.rxbcast, r.rxbcast) + if self.mlx == 4 then + -- ConnectX 4 doesn't have per-queue drop stats, + -- but this counter appears to always be zero :/ + set(stats.rxdrop, r.rxdrop) + end + set(stats.rxerrors, r.rxerrors) + set(stats.txbytes, r.txbytes) + set(stats.txpackets, r.txpackets) + set(stats.txmcast, r.txmcast) + set(stats.txbcast, r.txbcast) + set(stats.txdrop, r.txdrop) + set(stats.txerrors, r.txerrors) + end + }, + { + start_fn = HCA.get_port_speed_start, + finish_fn = HCA.get_port_speed_finish, + process_fn = function (r, stats) + counter.set(stats.speed, r) + end + }, + { + start_fn = HCA.get_port_status_start, + finish_fn = HCA.get_port_status_finish, + process_fn = function (r, stats) + counter.set(stats.status, (r.oper_status == 1 and 1) or 2) + end + }, + } + + -- Empty for ConnectX4 + for _, id in ipairs(counter_set_ids) do + table.insert(self.stats_reqs, + { + start_fn = HCA.query_q_counter_start, + finish_fn = HCA.query_q_counter_finish, + args = { set_id = id }, + process_fn = function(r, stats) + -- Incremental update relies on query_q_counter to + -- clear the counter after read. + counter.set(stats.rxdrop, + counter.read(stats.rxdrop) + r.out_of_buffer) + end + }) + end + + for _, req in ipairs(self.stats_reqs) do + req.hca = hca_factory:new() + -- Post command + req.start_fn(req.hca, req.args) + end + self.sync_timer = lib.throttle(1) + + function free_cxq (cxq) + -- Force CXQ state -> DEAD + local timeout = lib.timeout(2) + lib.waitfor(function () + assert(not timeout(), "ConnectX: failed to close CXQ.") + return sync.cas(cxq.state, IDLE, DEAD) + or sync.cas(cxq.state, FREE, DEAD) + end) + -- Reclaim packets + for idx=0, cxq.rqsize-1 do + if cxq.rx[idx] ~= nil then + packet.free(cxq.rx[idx]) + cxq.rx[idx] = nil + end + end + for idx=0, cxq.sqsize-1 do + if cxq.tx[idx] ~= nil then + packet.free(cxq.tx[idx]) + cxq.tx[idx] = nil + end + end + end + + function self:stop () + pci.set_bus_master(pciaddress, false) + pci.reset_device(pciaddress) + pci.close_pci_resource(fd, mmio) + mmio, fd = nil + for shmpath, cxq in pairs(cxq_shm) do + free_cxq(cxq) + shm.unlink(shmpath) + end + end + + function self:pull () + if self.sync_timer() then + self:sync_stats() + end + end + + function self:sync_stats () + for _, req in ipairs(self.stats_reqs) do + local hca = req.hca + if hca:completed() then + req.process_fn(req.finish_fn(hca), self.stats) + hca:post() + end + end + end + + -- Save "instance variable" values. + self.hca = hca + + return self +end + +function ConnectX:dump_capabilities (hca) + --if true then return end + -- Print current and maximum card capabilities. + -- XXX Check if we have any specific requirements that we need to + -- set and/or assert on. + local cur = hca:query_hca_general_cap('current') + local max = hca:query_hca_general_cap('max') + print'Capabilities - current and (maximum):' + for k in pairs(cur) do + print((" %-24s = %-3s (%s)"):format(k, cur[k], max[k])) + end +end + +function ConnectX:check_vport () + if true then return end + local vport_ctx = hca:query_nic_vport_context() + for k,v in pairs(vport_ctx) do + print(k,v) + end + local vport_state = hca:query_vport_state() + for k,v in pairs(vport_state) do + print(k,v) + end +end + +function ConnectX:print_vport_counter () + local c = self.hca:query_vport_counter() + local t = {} + -- Sort into key order + for k in pairs(c) do table.insert(t, k) end + table.sort(t) + for _, k in pairs(t) do + print(("%12s %s"):format(lib.comma_value(c[k]), k)) + end +end + +--------------------------------------------------------------- +-- Firmware commands. +-- +-- Code for sending individual messages to the firmware. +-- These messages are defined in the "Command Reference" section +-- of the Mellanox Programmer Reference Manual (PRM). +-- +-- (See further below for the implementation of the command interface.) +--------------------------------------------------------------- + +-- These commands are all built on a handful of primitives for sending +-- commands to the HCA. The parameters to these functions are chosen +-- to be easy to cross-reference with the definitions in the PRM. +-- +-- command(name, last_input_offset, last_output_offset) +-- Start preparing a command for the HCA. +-- The input and output sizes are given as the offsets of their +-- last dwords. +-- The command name is given only for debugging purposes. +-- +-- input(name, offset, highbit, lowbit, value) +-- Specify an input parameter to the current command. +-- The parameter value is stored in the given bit-range at the +-- given offset. +-- The parameter name is given only for debugging purposes. +-- +-- execute() +-- Execute the command specified starting with the most recent +-- call to command(). +-- If the command fails then an exception is raised. +-- +-- output(offset, highbit, lowbit) +-- Return a value from the output of the command. + +-- Note: Parameters are often omitted when their default value (zero) +-- is sensible. Exceptions are made for more important ones. + +-- hca object is the main interface towards the NIC firmware. +HCA = {} + +-- Create a factory for HCAs for the given Initialization Segment +-- (i.e. device). Application of the new() method to the returned +-- object allocates a new HCA for the next available Command Queue +-- Entry. +function HCA_factory (init_seg, cmdq_size) + local self = {} + self.size = 2^init_seg:log_cmdq_size() + self.stride = 2^init_seg:log_cmdq_stride() + self.init_seg = init_seg + -- Next queue to be allocated by :new() + self.nextq = 0 + local cmdq_size = cmdq_size or self.size + assert(cmdq_size <= self.size, "command queue size limit exceeded") + local cmdq_t = ffi.typeof("uint8_t (*)[$]", self.stride) + local entries, entries_phy = memory.dma_alloc(cmdq_size * self.stride, 4096) + self.entries = ffi.cast(cmdq_t, entries) + init_seg:cmdq_phy_addr(entries_phy) + return setmetatable(self, { __index = HCA }) +end + +--------------------------------------------------------------- +-- Startup & General commands +--------------------------------------------------------------- + +-- Turn on the NIC. +function HCA:enable_hca () + self:command("ENABLE_HCA", 0x0C, 0x08) + :input("opcode", 0x00, 31, 16, 0x104) + :execute() +end + +-- Initialize the NIC firmware. +function HCA:init_hca () + self:command("INIT_HCA", 0x0c, 0x0c) + :input("opcode", 0x00, 31, 16, 0x102) + :execute() +end + +-- Set the software-firmware interface version to use. +function HCA:set_issi (issi) + self:command("SET_ISSI", 0x0C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x10B) + :input("issi", 0x08, 15, 0, issi) + :execute() +end + +-- Query the value of the "reserved lkey" for using physical addresses. +function HCA:query_rlkey () + self:command("QUERY_SPECIAL_CONTEXTS", 0x0C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x203) + :execute() + local rlkey = self:output(0x0C, 31, 0) + return rlkey +end + +-- Query how many pages of memory the NIC needs. +function HCA:query_pages (which) + self:command("QUERY_PAGES", 0x0C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x107) + :input("opmod", 0x04, 15, 0, ({boot=1,init=2,regular=3})[which]) + :execute() + return self:output(0x0C, 31, 0) +end + +-- Provide the NIC with freshly allocated memory. +function HCA:alloc_pages (num_pages) + assert(num_pages > 0) + self:command("MANAGE_PAGES", 0x14 + num_pages*8, 0x0C) + :input("opcode", 0x00, 31, 16, 0x108) + :input("opmod", 0x04, 15, 0, 1) -- allocate mode + :input("input_num_entries", 0x0C, 31, 0, num_pages, "input_num_entries") + for i=0, num_pages-1 do + local _, phy = memory.dma_alloc(4096, 4096) + self:input(nil, 0x10 + i*8, 31, 0, ptrbits(phy, 63, 32)) + self:input(nil, 0x14 + i*8, 31, 12, ptrbits(phy, 31, 12)) + end + self:execute() +end + +-- Query the NIC capabilities (maximum or current setting). +function HCA:query_hca_general_cap (max_or_current) + local opmod = assert(({max=0, current=1})[max_or_current]) + self:command("QUERY_HCA_CAP", 0x0C, 0x100C - 3000) + :input("opcode", 0x00, 31, 16, 0x100) + :input("opmod", 0x04, 0, 0, opmod) + :execute() + return { + log_max_cq_sz = self:output(0x10 + 0x18, 23, 16), + log_max_cq = self:output(0x10 + 0x18, 4, 0), + log_max_eq_sz = self:output(0x10 + 0x1C, 31, 24), + log_max_mkey = self:output(0x10 + 0x1C, 21, 16), + log_max_eq = self:output(0x10 + 0x1C, 3, 0), + max_indirection = self:output(0x10 + 0x20, 31, 24), + log_max_mrw_sz = self:output(0x10 + 0x20, 22, 16), + log_max_klm_list_size = self:output(0x10 + 0x20, 5, 0), + end_pad = self:output(0x10 + 0x2C, 31, 31), + start_pad = self:output(0x10 + 0x2C, 28, 28), + cache_line_128byte = self:output(0x10 + 0x2C, 27, 27), + vport_counters = self:output(0x10 + 0x30, 30, 30), + vport_group_manager = self:output(0x10 + 0x34, 31, 31), + nic_flow_table = self:output(0x10 + 0x34, 25, 25), + port_type = self:output(0x10 + 0x34, 9, 8), + num_ports = self:output(0x10 + 0x34, 7, 0), + log_max_msg = self:output(0x10 + 0x38, 28, 24), + max_tc = self:output(0x10 + 0x38, 19, 16), + cqe_version = self:output(0x10 + 0x3C, 3, 0), + cmdif_checksum = self:output(0x10 + 0x40, 15, 14), + wq_signature = self:output(0x10 + 0x40, 11, 11), + sctr_data_cqe = self:output(0x10 + 0x40, 10, 10), + eth_net_offloads = self:output(0x10 + 0x40, 3, 3), + cq_oi = self:output(0x10 + 0x44, 31, 31), + cq_resize = self:output(0x10 + 0x44, 30, 30), + cq_moderation = self:output(0x10 + 0x44, 29, 29), + cq_eq_remap = self:output(0x10 + 0x44, 25, 25), + scqe_break_moderation = self:output(0x10 + 0x44, 21, 21), + cq_period_start_from_cqe = self:output(0x10 + 0x44, 20, 20), + imaicl = self:output(0x10 + 0x44, 14, 14), + xrc = self:output(0x10 + 0x44, 3, 3), + ud = self:output(0x10 + 0x44, 2, 2), + uc = self:output(0x10 + 0x44, 1, 1), + rc = self:output(0x10 + 0x44, 0, 0), + uar_sz = self:output(0x10 + 0x48, 21, 16), + log_pg_sz = self:output(0x10 + 0x48, 7, 0), + bf = self:output(0x10 + 0x4C, 31, 31), + driver_version = self:output(0x10 + 0x4C, 30, 30), + pad_tx_eth_packet = self:output(0x10 + 0x4C, 29, 29), + log_bf_reg_size = self:output(0x10 + 0x4C, 20, 16), + log_max_transport_domain = self:output(0x10 + 0x64, 28, 24), + log_max_pd = self:output(0x10 + 0x64, 20, 16), + max_flow_counter = self:output(0x10 + 0x68, 15, 0), + log_max_rq = self:output(0x10 + 0x6C, 28, 24), + log_max_sq = self:output(0x10 + 0x6C, 20, 16), + log_max_tir = self:output(0x10 + 0x6C, 12, 8), + log_max_tis = self:output(0x10 + 0x6C, 4, 0), + basic_cyclic_rcv_wqe = self:output(0x10 + 0x70, 31, 31), + log_max_rmp = self:output(0x10 + 0x70, 28, 24), + log_max_rqt = self:output(0x10 + 0x70, 20, 16), + log_max_rqt_size = self:output(0x10 + 0x70, 12, 8), + log_max_tis_per_sq = self:output(0x10 + 0x70, 4, 0), + log_max_stride_sz_rq = self:output(0x10 + 0x74, 28, 24), + log_min_stride_sz_rq = self:output(0x10 + 0x74, 20, 16), + log_max_stride_sz_sq = self:output(0x10 + 0x74, 12, 8), + log_min_stride_sz_sq = self:output(0x10 + 0x74, 4, 0), + log_max_wq_sz = self:output(0x10 + 0x78, 4, 0), + log_max_vlan_list = self:output(0x10 + 0x7C, 20, 16), + log_max_current_mc_list = self:output(0x10 + 0x7C, 12, 8), + log_max_current_uc_list = self:output(0x10 + 0x7C, 4, 0), + log_max_l2_table = self:output(0x10 + 0x90, 28, 24), + log_uar_page_sz = self:output(0x10 + 0x90, 15, 0), + device_frequency_mhz = self:output(0x10 + 0x98, 31, 0) + } +end + +-- Teardown the NIC firmware. +-- mode = 0 (graceful) or 1 (panic) +function HCA:teardown_hca (mode) + self:command("TEARDOWN_HCA", 0x0c, 0x0c) + :input("opcode", 0x00, 31, 16, 0x103) + :input("opmod", 0x04, 15, 0, mode) + :execute() +end + +function HCA:disable_hca () + self:command("DISABLE_HCA", 0x0c, 0x0c) + :input("opcode", 0x00, 31, 16, 0x105) + :execute() +end + +--------------------------------------------------------------- +-- Event queues +--------------------------------------------------------------- + +-- Create an event queue that can be accessed via the given UAR page number. +function HCA:create_eq (uar) + local numpages = 1 + local log_eq_size = 7 -- 128 entries + local ptr, phy = memory.dma_alloc(4096, 4096) -- memory for entries + self:command("CREATE_EQ", 0x10C + numpages*8, 0x0C) + :input("opcode", 0x00, 31, 16, 0x301) + :input("log_eq_size", 0x10 + 0x0C, 28, 24, log_eq_size) + :input("uar_page", 0x10 + 0x0C, 23, 0, uar) + :input("log_page_size", 0x10 + 0x18, 28, 24, 2) -- XXX best value? 0 or max? + :input("event bitmask", 0x10 + 0x5C, 31, 0, bits({PageRequest=0xB})) -- XXX more events? + :input("pas[0] high", 0x110, 31, 0, ptrbits(phy, 63, 32)) + :input("pas[0] low", 0x114, 31, 0, ptrbits(phy, 31, 0)) + :execute() + local eqn = self:output(0x08, 7, 0) + return eq:new(eqn, ptr, 2^log_eq_size) +end + +-- Event Queue Entry (EQE) +local eqe_t = ffi.typeof([[ + struct { + uint16_t event_type; + uint16_t event_sub_type; + uint32_t event_data; + uint16_t pad; + uint8_t signature; + uint8_t owner; + } ]] ) + +eq = {} +eq.__index = eq + +-- Create event queue object. +function eq:new (eqn, pointer, nentries) + local ring = ffi.cast(ffi.typeof("$*", eqe_t), pointer) + for i = 0, nentries-1 do + ring[i].owner = 1 + end + return setmetatable({eqn = eqn, + ring = ring, + index = 0, + n = nentries}, + self) +end + +-- Poll the queue for events. +function eq:poll() + print("Polling EQ") + local eqe = self.ring[self.index] + while eqe.owner == 0 and eqe.event_type ~= 0xFF do + self.index = self.index + 1 + eqe = self.ring[self.index % self.n] + self:event(eqe) + end + print("done polling EQ") +end + +-- Handle an event. +function eq:event () + print(("Got event %s.%s"):format(eqe.event_type, eqe.event_sub_type)) + error("Event handling not yet implemented") +end + +--------------------------------------------------------------- +-- Vport +--------------------------------------------------------------- + +function HCA:set_vport_admin_state (up) + self:command("MODIFY_VPORT_STATE", 0x0c, 0x0c) + :input("opcode", 0x00, 31, 16, 0x751) + :input("admin_state", 0x0C, 7, 4, up and 1 or 0) + :execute() +end + +function HCA:query_vport_state () + self:command("QUERY_VPORT_STATE", 0x0c, 0x0c) + :input("opcode", 0x00, 31, 16, 0x750) + :execute() + return { admin_state = self:output(0x0C, 7, 4), + oper_state = self:output(0x0C, 3, 0) } +end + +-- Convenience function +function HCA:linkup () + return self:query_vport_state().oper_state == 1 +end + +function HCA:query_vport_counter () + self:command("QUERY_VPORT_COUNTER", 0x1c, 0x20c) + :input("opcode", 0x00, 31, 16, 0x770) + :execute() + local function get64 (offset) + local hi = self:output(offset, 31, 0) + local lo = self:output(offset + 4, 31, 0) + return lo + (hi * 2^32) + end + return { + rx_error_packets = get64(0x10), + rx_error_octets = get64(0x18), + tx_error_packets = get64(0x20), + tx_error_octets = get64(0x28), + rx_bcast_packets = get64(0x70), + rx_bcast_octets = get64(0x78), + tx_bcast_packets = get64(0x80), + tx_bcast_octets = get64(0x88), + rx_ucast_packets = get64(0x90), + rx_ucast_octets = get64(0x98), + tx_ucast_packets = get64(0xA0), + tx_ucast_octets = get64(0xA8), + rx_mcast_packets = get64(0xB0), + rx_mcast_octets = get64(0xB8), + tx_mcast_packets = get64(0xC0), + tx_mcast_octets = get64(0xC8) + } +end + +function HCA:query_nic_vport_context () + self:command("QUERY_NIC_VPORT_CONTEXT", 0x0c, 0x10+0xFC) + :input("opcode", 0x00, 31, 16, 0x754) + :execute() + local mac_hi = self:output(0x10+0xF4, 31, 0) + local mac_lo = self:output(0x10+0xF8, 31, 0) + local mac = macaddress:new(bit.tohex(mac_hi, 4) .. bit.tohex(mac_lo, 8)) + return { min_wqe_inline_mode = self:output(0x10+0x00, 26, 24), + mtu = self:output(0x10+0x24, 15, 0), + promisc_uc = self:output(0x10+0xf0, 31, 31) == 1, + promisc_mc = self:output(0x10+0xf0, 30, 30) == 1, + promisc_all = self:output(0x10+0xf0, 29, 29) == 1, + permanent_address = mac } +end + +function HCA:modify_nic_vport_context (mtu, promisc_uc, promisc_mc, promisc_all) + self:command("MODIFY_NIC_VPORT_CONTEXT", 0x1FC, 0x0C) + :input("opcode", 0x00, 31, 16, 0x755) + :input("field_select", 0x0C, 31, 0, 0x50) -- MTU + promisc + :input("mtu", 0x100 + 0x24, 15, 0, mtu) + :input("promisc_uc", 0x100 + 0xF0, 31, 31, promisc_uc and 1 or 0) + :input("promisc_mc", 0x100 + 0xF0, 30, 30, promisc_mc and 1 or 0) + :input("promisc_all", 0x100 + 0xF0, 29, 29, promisc_all and 1 or 0) + :execute() +end + +--------------------------------------------------------------- +-- TIR and TIS +--------------------------------------------------------------- + +-- Allocate a Transport Domain. +function HCA:alloc_transport_domain () + self:command("ALLOC_TRANSPORT_DOMAIN", 0x0c, 0x0c) + :input("opcode", 0x00, 31, 16, 0x816) + :execute(0x0C, 0x0C) + return self:output(0x08, 23, 0) +end + +-- Create a TIR (Transport Interface Receive) with direct dispatch (no hashing) +function HCA:create_tir_direct (rqn, transport_domain) + self:command("CREATE_TIR", 0x10C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x900) + :input("inline_rqn", 0x20 + 0x1C, 23, 0, rqn) + :input("transport_domain", 0x20 + 0x24, 23, 0, transport_domain) + :execute() + return self:output(0x08, 23, 0) +end + +-- Create a TIR with indirect dispatching (hashing) based on IPv4/IPv6 +-- addresses and optionally TCP/UDP ports. +function HCA:create_tir_indirect (rqt, transport_domain, l3_proto, l4_proto) + local l3_protos = { + v4 = 0, + v6 = 1 + } + local l4_protos = { + tcp = 0, + udp = 1 + } + local l3_proto = assert(l3_protos[l3_proto or 'v4'], "invalid l3 proto") + self:command("CREATE_TIR", 0x10C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x900) + :input("disp_type", 0x20 + 0x04, 31, 28, 1) -- indirect + -- Symmetric hashing would sort src/dst ports prior to hashing to + -- map bi-directional traffic to the same queue. We don't need that + -- since flows are inherently uni-directional. + :input("rx_hash_symmetric",0x20 + 0x20, 31, 31, 0) -- disabled + :input("indirect_table", 0x20 + 0x20, 23, 0, rqt) + :input("rx_hash_fn", 0x20 + 0x24, 31, 28, 2) -- toeplitz + :input("transport_domain", 0x20 + 0x24, 23, 0, transport_domain) + :input("l3_prot_type", 0x20 + 0x50, 31, 31, l3_proto) + if l4_proto == nil then + self:input("selected_fields", 0x20 + 0x50, 29, 0, 3) -- SRC/DST + else + l4_proto = assert(l4_protos[l4_proto or 'tcp'], "invalid l4 proto") + self:input("l4_prot_type", 0x20 + 0x50, 30, 30, l4_proto) + :input("selected_fields", 0x20 + 0x50, 29, 0, 15) -- SRC/DST/SPORT/DPORT + end + -- XXX Is random hash key a good solution? + for i = 0x28, 0x4C, 4 do + self:input("toeplitz_key["..((i-0x28)/4).."]", 0x20 + i, 31, 0, math.random(2^32)) + end + self:execute() + return self:output(0x08, 23, 0) +end + +function HCA:create_rqt (rqlist) + -- Problem: Hardware requires number of hash buckets to be a power of 2. + -- Workaround: Setup max # hash buckets and fill with queues in a loop. + self:command("CREATE_RQT", 0x20 + 0xF0 + 4*rqt_max_size, 0x0C) + :input("opcode", 0x00, 31, 16, 0x916) + :input("rqt_max_size", 0x20 + 0x14, 15, 0, rqt_max_size) + :input("rqt_actual_size", 0x20 + 0x18, 15, 0, rqt_max_size) + for i = 0, rqt_max_size-1 do + self:input("rq_num["..i.."]", 0x20 + 0xF0 + i*4, 23, 0, rqlist[1 + (i % #rqlist)]) + end + self:execute() + return self:output(0x08, 23, 0) +end + +-- Create TIS (Transport Interface Send) +function HCA:create_tis (prio, transport_domain) + self:command("CREATE_TIS", 0x20 + 0x9C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x912) + :input("prio", 0x20 + 0x00, 19, 16, prio) + :input("transport_domain", 0x20 + 0x24, 23, 0, transport_domain) + :execute() + return self:output(0x08, 23, 0) +end + +-- Allocate a UAR (User Access Region) i.e. a page of MMIO registers. +function HCA:alloc_uar () + self:command("ALLOC_UAR", 0x0C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x802) + :execute() + return self:output(0x08, 23, 0) +end + +-- Allocate a Protection Domain. +function HCA:alloc_protection_domain () + self:command("ALLOC_PD", 0x0C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x800) + :execute() + return self:output(0x08, 23, 0) +end + +-- Create a completion queue and return a completion queue object. +function HCA:create_cq (entries, uar_page, eqn, collapsed) + local doorbell, doorbell_phy = memory.dma_alloc(16) + -- Memory for completion queue entries + local size = entries * 64 + local cqe, cqe_phy = memory.dma_alloc(size, 4096) + local log_page_size = log2size(math.ceil(size/4096)) + ffi.fill(cqe, entries * 64, 0xFF) + self:command("CREATE_CQ", 0x114, 0x0C) + :input("opcode", 0x00, 31, 16, 0x400) + :input("cc", 0x10 + 0x00, 20, 20, collapsed and 1 or 0) + :input("oi", 0x10 + 0x00, 17, 17, 1) + :input("log_cq_size", 0x10 + 0x0C, 28, 24, log2size(entries)) + :input("uar_page", 0x10 + 0x0C, 23, 0, uar_page) + :input("c_eqn", 0x10 + 0x14, 7, 0, eqn) + :input("log_page_size", 0x10 + 0x18, 28, 24, log_page_size) + :input("db_addr high", 0x10 + 0x38, 31, 0, ptrbits(doorbell_phy, 63, 32)) + :input("db_addr_low", 0x10 + 0x3C, 31, 0, ptrbits(doorbell_phy, 31, 0)) + :input("pas[0] high", 0x110, 31, 0, ptrbits(cqe_phy, 63, 32)) + :input("pas[0] low", 0x114, 31, 0, ptrbits(cqe_phy, 31, 0)) + :execute() + local cqn = self:output(0x08, 23, 0) + return cqn, cqe +end + +-- Create a receive queue and return a receive queue object. +-- Return the receive queue number and a pointer to the WQEs. +function HCA:create_rq (cqn, pd, stride, size, doorbell, rwq, counter_set_id) + local log_wq_stride = log2size(stride) + local log_wq_size = log2size(size) + local db_phy = memory.virtual_to_physical(doorbell) + local rwq_phy = memory.virtual_to_physical(rwq) + local log_page_size = log2size(math.ceil(size * 64/4096)) + self:command("CREATE_RQ", 0x20 + 0x30 + 0xC4, 0x0C) + :input("opcode", 0x00, 31, 16, 0x908) + :input("rlkey", 0x20 + 0x00, 31, 31, 1) + :input("vlan_strip_disable", 0x20 + 0x00, 28, 28, 1) + :input("cqn", 0x20 + 0x08, 23, 0, cqn) + :input("wq_type", 0x20 + 0x30 + 0x00, 31, 28, 1) -- cyclic + :input("pd", 0x20 + 0x30 + 0x08, 23, 0, pd) + :input("dbr_addr high", 0x20 + 0x30 + 0x10, 31, 0, ptrbits(db_phy, 63, 32)) + :input("dbr_addr low", 0x20 + 0x30 + 0x14, 31, 0, ptrbits(db_phy, 31, 0)) + :input("log_wq_stride", 0x20 + 0x30 + 0x20, 19, 16, log_wq_stride) + :input("log_page_size", 0x20 + 0x30 + 0x20, 12, 8, log_page_size) + :input("log_wq_size", 0x20 + 0x30 + 0x20, 4 , 0, log_wq_size) + :input("pas[0] high", 0x20 + 0x30 + 0xC0, 63, 32, ptrbits(rwq_phy, 63, 32)) + :input("pas[0] low", 0x20 + 0x30 + 0xC4, 31, 0, ptrbits(rwq_phy, 31, 0)) + if counter_set_id then + -- Only set for ConnectX5 and higher + self:input("counter_set_id",0x20 + 0x0C, 31, 24, counter_set_id) + end + self:execute() + return self:output(0x08, 23, 0) +end + +-- Modify a Receive Queue by making a state transition. +function HCA:modify_rq (rqn, curr_state, next_state) + self:command("MODIFY_RQ", 0x20 + 0x30 + 0xC4, 0x0C) + :input("opcode", 0x00, 31, 16, 0x909) + :input("curr_state", 0x08, 31, 28, curr_state) + :input("rqn", 0x08, 27, 0, rqn) + :input("next_state", 0x20 + 0x00, 23, 20, next_state) + :execute() +end + +-- Modify a Send Queue by making a state transition. +function HCA:modify_sq (sqn, curr_state, next_state) + self:command("MODIFY_SQ", 0x20 + 0x30 + 0xC4, 0x0C) + :input("opcode", 0x00, 31, 16, 0x905) + :input("curr_state", 0x08, 31, 28, curr_state) + :input("sqn", 0x08, 23, 0, sqn) + :input("next_state", 0x20 + 0x00, 23, 20, next_state) + :execute() +end + +-- Create a Send Queue. +-- Return the send queue number and a pointer to the WQEs. +function HCA:create_sq (cqn, pd, stride, size, doorbell, swq, uar, tis) + local log_wq_stride = log2size(stride) + local log_wq_size = log2size(size) + local db_phy = memory.virtual_to_physical(doorbell) + local swq_phy = memory.virtual_to_physical(swq) + self:command("CREATE_SQ", 0x20 + 0x30 + 0xC4, 0x0C) + :input("opcode", 0x00, 31, 16, 0x904) + :input("rlkey", 0x20 + 0x00, 31, 31, 1) + :input("fre", 0x20 + 0x00, 29, 29, 1) + :input("flush_in_error_en", 0x20 + 0x00, 28, 28, 1) + :input("min_wqe_inline_mode", 0x20 + 0x00, 26, 24, 1) + :input("cqn", 0x20 + 0x08, 23, 0, cqn) + :input("tis_lst_sz", 0x20 + 0x20, 31, 16, 1) + :input("tis", 0x20 + 0x2C, 23, 0, tis) + :input("wq_type", 0x20 + 0x30 + 0x00, 31, 28, 1) -- cyclic + :input("pd", 0x20 + 0x30 + 0x08, 23, 0, pd) + :input("uar_page", 0x20 + 0x30 + 0x0C, 23, 0, uar) + :input("pas[0] high", 0x20 + 0x30 + 0x10, 31, 0, ptrbits(db_phy, 63, 32)) + :input("pas[0] low", 0x20 + 0x30 + 0x14, 31, 0, ptrbits(db_phy, 31, 0)) + :input("log_wq_stride", 0x20 + 0x30 + 0x20, 19, 16, log_wq_stride) + :input("log_wq_page_sz", 0x20 + 0x30 + 0x20, 12, 8, 6) -- XXX check + :input("log_wq_size", 0x20 + 0x30 + 0x20, 4, 0, log_wq_size) + :input("pas[0] high", 0x20 + 0x30 + 0xC0, 31, 0, ptrbits(swq_phy, 63, 32)) + :input("pas[0] low", 0x20 + 0x30 + 0xC4, 31, 0, ptrbits(swq_phy, 31, 0)) + + :execute() + return self:output(0x08, 23, 0) +end + +--------------------------------------------------------------- +-- IO app: attach to transmit and receive queues. +--------------------------------------------------------------- + +IO = {} +IO.__index = IO +-- The IO module is the device driver in the sense of +-- lib.hardware.pci.device_info +driver = IO + +function IO:new (conf) + local self = setmetatable({}, self) + + local pciaddress = pci.qualified(conf.pciaddress) + local queue = conf.queue + -- This is also done in Connectex4:new() but might not have + -- happened yet. + pci.unbind_device_from_linux(pciaddress) + local fd = pci.open_pci_resource_unlocked(pciaddress, 0) + local mmio = pci.map_pci_memory(fd) + + local online = false -- True when queue is up and running + local cxq -- shm object containing queue control information + local sq -- SQ send queue object + local rq -- RQ receive queue object + local open_throttle = -- Timer to throttle shm open attempts (10ms) + lib.throttle(0.25) + + -- Close the queue mapping. + local function close () + shm.unlink(self.backlink) + shm.unmap(cxq) + cxq = nil + end + + -- Open the queue mapping. + local function open () + local shmpath = "group/pci/"..pciaddress.."/"..queue + self.backlink = "mellanox/"..pciaddress.."/"..queue + if shm.exists(shmpath) then + shm.alias(self.backlink, shmpath) + cxq = shm.open(shmpath, cxq_t) + if sync.cas(cxq.state, FREE, IDLE) then + sq = SQ:new(cxq, mmio) + rq = RQ:new(cxq) + else + close() -- Queue was not FREE. + end + end + end + + -- Return true on successful activation of the queue. + local function activate () + -- If not open then make a request on a regular schedule. + if cxq == nil and open_throttle() then + open() + end + if cxq then + -- Careful: Control app may have closed the CXQ. + if sync.cas(cxq.state, IDLE, BUSY) then + return true + else + assert(cxq.state[0] == DEAD, "illegal state detected") + close() + end + end + end + + -- Enter the idle state. + local function deactivate () + assert(sync.cas(cxq.state, BUSY, IDLE)) + end + + -- Send packets to the NIC + function self:push () + if activate() then + sq:transmit(self.input.input or self.input.rx) + sq:reclaim() + deactivate() + end + end + + -- Receive packets from the NIC. + function self:pull () + if activate() then + rq:receive(self.output.output or self.output.tx) + rq:refill() + deactivate() + end + end + + -- Detach from the NIC. + function self:stop () + close() + end + + return self +end + +--------------------------------------------------------------- +-- Receive queue + +RQ = {} + +function RQ:new (cxq) + local rq = {} + + local mask = cxq.rqsize - 1 + -- Return the transmit queue slot for the given WQE ID. + local function slot (wqeid) + return band(wqeid, mask) + end + + -- Refill with buffers + function rq:refill () + local notify = false -- have to notify NIC with doorbell ring? + while cxq.rx[slot(cxq.next_rx_wqeid)] == nil do + local p = packet.allocate() + cxq.rx[slot(cxq.next_rx_wqeid)] = p + local rwqe = cxq.rwq[slot(cxq.next_rx_wqeid)] + local phy = memory.virtual_to_physical(p.data) + rwqe.length = bswap(packet.max_payload) + rwqe.lkey = bswap(cxq.rlkey) + rwqe.dma_hi = bswap(tonumber(shr(phy, 32))) + rwqe.dma_lo = bswap(tonumber(band(phy, 0xFFFFFFFF))) + cxq.next_rx_wqeid = cxq.next_rx_wqeid + 1 + notify = true + end + if notify then + -- ring doorbell + cxq.doorbell.receive = bswap(cxq.next_rx_wqeid) + end + end + + local function have_input () + local c = cxq.rcq[cxq.next_rx_cqeid] + local owner = bit.band(1, c.u8[0x3F]) + return owner == cxq.rx_mine + end + + function rq:receive (l) + local limit = engine.pull_npackets + while have_input() and limit > 0 and not link.full(l) do + -- Find the next completion entry. + local c = cxq.rcq[cxq.next_rx_cqeid] + limit = limit - 1 + -- Advance to next completion. + -- Note: assumes sqsize == cqsize + cxq.next_rx_cqeid = slot(cxq.next_rx_cqeid + 1) + -- Toggle the ownership value if the CQ wraps around. + if cxq.next_rx_cqeid == 0 then + cxq.rx_mine = (cxq.rx_mine + 1) % 2 + end + -- Decode the completion entry. + local opcode = shr(c.u8[0x3F], 4) + local len = bswap(c.u32[0x2C/4]) + local wqeid = shr(bswap(c.u32[0x3C/4]), 16) + local idx = slot(wqeid) + if opcode == 0 or opcode == 2 then + -- Successful receive + local p = cxq.rx[idx] + assert(p ~= nil) + p.length = len + link.transmit(l, p) + cxq.rx[idx] = nil + elseif opcode == 13 or opcode == 14 then + -- Error on receive + assert(cxq.rx[idx] ~= nil) + packet.free(cxq.rx[idx]) + cxq.rx[idx] = nil + local syndromes = { + [0x1] = "Local_Length_Error", + [0x4] = "Local_Protection_Error", + [0x5] = "Work_Request_Flushed_Error", + [0x6] = "Memory_Window_Bind_Error", + [0x10] = "Bad_Response_Error", + [0x11] = "Local_Access_Error", + [0x12] = "Remote_Invalid_Request_Error", + [0x13] = "Remote_Access_Error", + [0x14] = "Remote_Operation_Error" + } + local syndrome = c.u8[0x37] + error(("Got error. opcode=%d syndrome=0x%x message=%s") + :format(opcode, syndrome, syndromes[syndromes])) + else + error(("Unexpected CQE opcode: %d (0x%x)"):format(opcode, opcode)) + end + end + end + + function rq:ring_doorbell () + doorbell[0].receive = bswap(next_buffer) + end + + return rq +end + +--------------------------------------------------------------- +-- Send queue + +SQ = {} + +function SQ:new (cxq, mmio) + local sq = {} + -- Cast pointers to expected types + local mmio = ffi.cast("uint8_t*", mmio) + cxq.bf_next = ffi.cast("uint64_t*", mmio + (cxq.uar * 4096) + 0x800) + cxq.bf_alt = ffi.cast("uint64_t*", mmio + (cxq.uar * 4096) + 0x900) + + local mask = cxq.sqsize - 1 + -- Return the transmit queue slot for the given WQE ID. + -- (Transmit queue is a smaller power of two than max WQE ID.) + local function slot (wqeid) + return band(wqeid, mask) + end + + -- Transmit packets from the link onto the send queue. + function sq:transmit (l) + local start_wqeid = cxq.next_tx_wqeid + local next_slot = slot(start_wqeid) + while not link.empty(l) and cxq.tx[next_slot] == nil do + local p = link.receive(l) + local wqe = cxq.swq[next_slot] + -- Store packet pointer so that we can free it later + cxq.tx[next_slot] = p + + -- Construct a 64-byte transmit descriptor. + -- This is in three parts: Control, Ethernet, Data. + -- The Ethernet part includes some inline data. + + -- Control segment + wqe.u32[0] = bswap(shl(cxq.next_tx_wqeid, 8) + 0x0A) + wqe.u32[1] = bswap(shl(cxq.sqn, 8) + 4) + wqe.u32[2] = bswap(shl(2, 2)) -- completion always + -- Ethernet segment + local ninline = 16 + wqe.u32[7] = bswap(shl(ninline, 16)) + ffi.copy(wqe.u8 + 0x1E, p.data, ninline) + -- Send Data Segment (inline data) + wqe.u32[12] = bswap(p.length - ninline) + wqe.u32[13] = bswap(cxq.rlkey) + local phy = memory.virtual_to_physical(p.data + ninline) + wqe.u32[14] = bswap(tonumber(shr(phy, 32))) + wqe.u32[15] = bswap(tonumber(band(phy, 0xFFFFFFFF))) + -- Advance counters + cxq.next_tx_wqeid = cxq.next_tx_wqeid + 1 + next_slot = slot(cxq.next_tx_wqeid) + end + -- Ring the doorbell if we enqueued new packets. + if cxq.next_tx_wqeid ~= start_wqeid then + local current_packet = slot(cxq.next_tx_wqeid + cxq.sqsize-1) + cxq.doorbell.send = bswap(cxq.next_tx_wqeid) + cxq.bf_next[0] = cxq.swq[current_packet].u64[0] + -- Switch next/alternate blue flame register for next time + cxq.bf_next, cxq.bf_alt = cxq.bf_alt, cxq.bf_next + end + end + + local next_reclaim = 0 + -- Free packets when their transmission is complete. + function sq:reclaim () + local opcode = cxq.scq[0].u8[0x38] + if opcode == 0x0A then + local wqeid = shr(bswap(cxq.scq[0].u32[0x3C/4]), 16) + while next_reclaim ~= wqeid % cxq.sqsize do + assert(cxq.tx[next_reclaim] ~= nil) + packet.free(cxq.tx[next_reclaim]) + cxq.tx[next_reclaim] = nil + next_reclaim = tonumber(slot(next_reclaim + 1)) + end + end + end + + return sq +end + +NIC_RX = 0 -- Flow table type code for incoming packets +NIC_TX = 1 -- Flow table type code for outgoing packets + +FLOW_TABLE = 1 -- Flow table entry destination_type for FLOW_TABLE +TIR = 2 -- Flow table entry destination_type for TIR + +-- Create a flow table. +function HCA:create_flow_table (table_type, level, size) + self:command("CREATE_FLOW_TABLE", 0x3C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x930) + :input("table_type", 0x10, 31, 24, table_type) + :input("level", 0x18 + 0x00, 23, 16, level or 0) + :input("log_size", 0x18 + 0x00, 7, 0, math.ceil(math.log(size or 1024, 2))) + :execute() + local table_id = self:output(0x08, 23, 0) + return table_id +end + +-- Set table as root flow table. +function HCA:set_flow_table_root (table_id, table_type) + self:command("SET_FLOW_TABLE_ROOT", 0x3C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x92F) + :input("table_type", 0x10, 31, 24, table_type) + :input("table_id", 0x14, 23, 0, table_id) + :execute() +end + +-- Create a "wildcard" flow group that does not inspect any fields. +function HCA:create_flow_group_wildcard (table_id, table_type, start_ix, end_ix) + self:command("CREATE_FLOW_GROUP", 0x3FC, 0x0C) + :input("opcode", 0x00, 31, 16, 0x933) + :input("table_type", 0x10, 31, 24, table_type) + :input("table_id", 0x14, 23, 0, table_id) + :input("start_ix", 0x1C, 31, 0, start_ix) + :input("end_ix", 0x24, 31, 0, end_ix) -- (inclusive) + :input("match_criteria_enable", 0x3C, 7, 0, 0) -- match outer headers + :execute() + local group_id = self:output(0x08, 23, 0) + return group_id +end + +-- Set a "wildcard" flow table entry that does not match on any fields. +function HCA:set_flow_table_entry_wildcard (table_id, table_type, group_id, + flow_index, dest_type, dest_id) + self:command("SET_FLOW_TABLE_ENTRY", 0x40 + 0x300, 0x0C) + :input("opcode", 0x00, 31, 16, 0x936) + :input("opmod", 0x04, 15, 0, 0) -- new entry + :input("table_type", 0x10, 31, 24, table_type) + :input("table_id", 0x14, 23, 0, table_id) + :input("flow_index", 0x20, 31, 0, flow_index) + :input("group_id", 0x40 + 0x04, 31, 0, group_id) + :input("action", 0x40 + 0x0C, 15, 0, 4) -- action = FWD_DST + :input("dest_list_sz", 0x40 + 0x10, 23, 0, 1) -- destination list size + :input("dest_type", 0x40 + 0x300, 31, 24, dest_type) + :input("dest_id", 0x40 + 0x300, 23, 0, dest_id) + :execute() +end + +-- Create a flow group that inspects the ethertype and optionally protocol fields. +function HCA:create_flow_group_ip (table_id, table_type, start_ix, end_ix, l3_only) + self:command("CREATE_FLOW_GROUP", 0x3FC, 0x0C) + :input("opcode", 0x00, 31, 16, 0x933) + :input("table_type", 0x10, 31, 24, table_type) + :input("table_id", 0x14, 23, 0, table_id) + :input("start_ix", 0x1C, 31, 0, start_ix) + :input("end_ix", 0x24, 31, 0, end_ix) -- (inclusive) + :input("match_criteria_enable", 0x3C, 7, 0, 1) -- match outer headers + :input("match_ether", 0x40 + 0x04, 15, 0, 0xFFFF) + if l3_only == nil then + self:input("match_proto", 0x40 + 0x10, 31, 24, 0xFF) + end + self:execute() + local group_id = self:output(0x08, 23, 0) + return group_id +end + +-- Set a flow table entry that matches on the ethertype for IPv4/IPv6 +-- as well as optionally on TCP/UDP protocol/next-header. +function HCA:set_flow_table_entry_ip (table_id, table_type, group_id, + flow_index, dest_type, dest_id, l3_proto, l4_proto) + local ethertypes = { + v4 = 0x0800, + v6 = 0x86dd + } + local l4_protos = { + udp = 17, + tcp = 6 + } + local type = assert(ethertypes[l3_proto], "invalid l3 proto") + self:command("SET_FLOW_TABLE_ENTRY", 0x40 + 0x300, 0x0C) + :input("opcode", 0x00, 31, 16, 0x936) + :input("opmod", 0x04, 15, 0, 0) -- new entry + :input("table_type", 0x10, 31, 24, table_type) + :input("table_id", 0x14, 23, 0, table_id) + :input("flow_index", 0x20, 31, 0, flow_index) + :input("group_id", 0x40 + 0x04, 31, 0, group_id) + :input("action", 0x40 + 0x0C, 15, 0, 4) -- action = FWD_DST + :input("dest_list_sz", 0x40 + 0x10, 23, 0, 1) -- destination list size + :input("match_ether", 0x40 + 0x40 + 0x04, 15, 0, type) + if l4_proto ~= nil then + local proto = assert(l4_protos[l4_proto], "invalid l4 proto") + self:input("match_proto", 0x40 + 0x40 + 0x10, 31, 24, proto) + end + self:input("dest_type", 0x40 + 0x300, 31, 24, dest_type) + :input("dest_id", 0x40 + 0x300, 23, 0, dest_id) + :execute() +end + +-- Create a DMAC+VLAN flow group. +function HCA:create_flow_group_macvlan (table_id, table_type, start_ix, end_ix, usevlan, mcast) + local dmac = (mcast and macaddress:new("01:00:00:00:00:00")) + or macaddress:new("ff:ff:ff:ff:ff:ff") + self:command("CREATE_FLOW_GROUP", 0x3FC, 0x0C) + :input("opcode", 0x00, 31, 16, 0x933) + :input("table_type", 0x10, 31, 24, table_type) + :input("table_id", 0x14, 23, 0, table_id) + :input("start_ix", 0x1C, 31, 0, start_ix) + :input("end_ix", 0x24, 31, 0, end_ix) -- (inclusive) + :input("match_criteria", 0x3C, 7, 0, 1) -- match outer headers + :input("dmac0", 0x40 + 0x08, 31, 0, bswap(dmac:subbits(0,32))) + :input("dmac1", 0x40 + 0x0C, 31, 16, shr(bswap(dmac:subbits(32,48)), 16)) + if usevlan then + self:input("vlanid", 0x40 + 0x0C, 11, 0, 0xFFF) + end + self:execute() + local group_id = self:output(0x08, 23, 0) + return group_id +end + +-- Set a DMAC+VLAN flow table rule. +function HCA:set_flow_table_entry_macvlan (table_id, table_type, group_id, + flow_index, dest_type, dest_id, dmac, vlanid, mcast) + local dest_ids = (mcast and dest_id) or {dest_id} + self:command("SET_FLOW_TABLE_ENTRY", 0x40 + 0x300 + 0x8*(#dest_ids-1), 0x0C) + :input("opcode", 0x00, 31, 16, 0x936) + :input("opmod", 0x04, 15, 0, 0) -- new entry + :input("table_type", 0x10, 31, 24, table_type) + :input("table_id", 0x14, 23, 0, table_id) + :input("flow_index", 0x20, 31, 0, flow_index) + :input("group_id", 0x40 + 0x04, 31, 0, group_id) + :input("action", 0x40 + 0x0C, 15, 0, 4) -- action = FWD_DST + :input("dest_list_sz", 0x40 + 0x10, 23, 0, #dest_ids) -- destination list size + :input("dmac0", 0x40 + 0x48, 31, 0, bswap(dmac:subbits(0,32))) + :input("dmac1", 0x40 + 0x4C, 31, 16, shr(bswap(dmac:subbits(32,48)), 16)) + :input("vlan", 0x40 + 0x4C, 11, 0, vlanid or 0) + for i, dest_id in ipairs(dest_ids) do + self:input("dest_type", 0x40 + 0x300 + 0x8*(i-1), 31, 24, dest_type) + self:input("dest_id", 0x40 + 0x300 + 0x8*(i-1), 23, 0, dest_id) + end + self:execute() +end + +--------------------------------------------------------------- +-- PHY control access +--------------------------------------------------------------- + +-- Note: portnumber is always 1 because the ConnectX-4 HCA is managing +-- a single physical port. + +PMTU = 0x5003 +PTYS = 0x5004 -- Port Type and Speed +PAOS = 0x5006 -- Port Administrative & Operational Status +PPCNT = 0x5008 -- Ports Performance Counters +PPLR = 0x5018 -- Port Physical Loopback Register + +-- Mapping of speed/protocols per 11.1.2 to speed in units of gbps +local port_speed = { + [0x00000002] = 1, -- 1000Base-KX + [0x00000004] = 10, -- 10GBase-CX4 + [0x00000008] = 10, -- 10GBase-KX4 + [0x00000010] = 10, -- 10GBase-KR + [0x00000040] = 40, -- 40GBase-CR4 + [0x00000080] = 40, -- 40GBase-KR4 + [0x00001000] = 10, -- 10GBase-CR + [0x00002000] = 10, -- 10GBase-SR + [0x00004000] = 10, -- 10GBase-ER/LR + [0x00008000] = 40, -- 40GBase-SR4 + [0x00010000] = 40, -- 40GBase-LR4/ER4 + [0x00040000] = 50, -- 50GBase-SR2 + [0x00100000] = 100, -- 100GBase-CR4 + [0x00200000] = 100, -- 100GBase-SR4 + [0x00400000] = 100, -- 100GBase-KR4 + -- Undocumented (from a ConnectX5 NIC with CWDM plugin) + [0x00800000] = 100, -- 100GBase-CWDM + [0x08000000] = 25, -- 25GBase-CR + [0x10000000] = 25, -- 25GBase-KR + [0x20000000] = 25, -- 25GBase-SR + [0x40000000] = 50, -- 50GBase-CR2 + [0x80000000] = 50, -- 50GBase-KR2 +} + +-- Get the speed of the port in bps +function HCA:get_port_speed_start () + self:command("ACCESS_REGISTER", 0x4C, 0x4C) + :input("opcode", 0x00, 31, 16, 0x805) + :input("opmod", 0x04, 15, 0, 1) -- read + :input("register_id", 0x08, 15, 0, PTYS) + :input("local_port", 0x10, 23, 16, 1) + :input("proto_mask", 0x10, 2, 0, 0x4) -- Ethernet + :execute_async() +end + +function HCA:get_port_speed_finish () + local eth_proto_oper = self:output(0x10 + 0x24, 31, 0) + return (port_speed[eth_proto_oper] or 0) * 1e9 +end + +-- Set the administrative status of the port (boolean up/down). +function HCA:set_admin_status (admin_up) + self:command("ACCESS_REGISTER", 0x1C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x805) + :input("opmod", 0x04, 15, 0, 0) -- write + :input("register_id", 0x08, 15, 0, PAOS) + :input("local_port", 0x10, 23, 16, 1) -- + :input("admin_status", 0x10, 11, 8, admin_up and 1 or 2) + :input("ase", 0x14, 31, 31, 1) -- enable admin state update + :execute() +end + +function HCA:set_port_mtu (mtu) + self:command("ACCESS_REGISTER", 0x1C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x805) + :input("opmod", 0x04, 15, 0, 0) -- write + :input("register_id", 0x08, 15, 0, PMTU) + :input("local_port", 0x10, 23, 16, 1) + :input("admin_mtu", 0x18, 31, 16, mtu) + :execute() +end + +local port_status = { admin_status = 0, oper_status = 0 } +function HCA:get_port_status () + self:command("ACCESS_REGISTER", 0x10, 0x1C) + :input("opcode", 0x00, 31, 16, 0x805) + :input("opmod", 0x04, 15, 0, 1) -- read + :input("register_id", 0x08, 15, 0, PAOS) + :input("local_port", 0x10, 23, 16, 1) + :execute() + port_status.admin_status = self:output(0x10, 11, 8) + port_status.oper_status = self:output(0x10, 3, 0) + return port_status +end + +function HCA:get_port_status_start () + self:command("ACCESS_REGISTER", 0x10, 0x1C) + :input("opcode", 0x00, 31, 16, 0x805) + :input("opmod", 0x04, 15, 0, 1) -- read + :input("register_id", 0x08, 15, 0, PAOS) + :input("local_port", 0x10, 23, 16, 1) + :execute() +end + +function HCA:get_port_status_finish () + port_status.admin_status = self:output(0x10, 11, 8) + port_status.oper_status = self:output(0x10, 3, 0) + return port_status +end + +function HCA:get_port_loopback_capability () + self:command("ACCESS_REGISTER", 0x10, 0x14) + :input("opcode", 0x00, 31, 16, 0x805) + :input("opmod", 0x04, 15, 0, 1) -- read + :input("register_id", 0x08, 15, 0, PPLR) + :input("local_port", 0x10, 23, 16, 1) + :execute() + local capability = self:getoutbits(0x14, 23, 16) + return capability +end + +function HCA:set_port_loopback (loopback_mode) + self:command("ACCESS_REGISTER", 0x14, 0x0C) + :input("opcode", 0x00, 31, 16, 0x805) + :input("opmod", 0x04, 15, 0, 0) -- write + :input("register_id", 0x08, 15, 0, PPLR) + :input("local_port", 0x10, 23, 16, 1) + :input("loopback_mode", 0x14, 7, 0, loopback_mode and 2 or 0) + :execute() +end + +local port_stats = { + rxbytes = 0ULL, + rxmcast = 0ULL, + rxbcast = 0ULL, + rxpackets = 0ULL, + rxdrop = 0ULL, + rxerrors = 0ULL, + txbytes = 0ULL, + txmcast = 0ULL, + txbcast = 0ULL, + txpackets = 0ULL, + txdrop = 0ULL, + txerrors = 0ULL, +} +function HCA:get_port_stats_start () + self:command("ACCESS_REGISTER", 0x14, 0x10C) + :input("opcode", 0x00, 31, 16, 0x805) + :input("opmod", 0x04, 15, 0, 1) -- read + :input("register_id", 0x08, 15, 0, PPCNT) + :input("local_port", 0x10, 23, 16, 1) + :input("grp", 0x10, 5, 0, 0x1) -- RFC 2863 + :execute_async() +end + +function HCA:get_port_stats_finish () + port_stats.rxbytes = self:output64(0x18 + 0x00) -- includes 4-byte CRC + local in_ucast_packets = self:output64(0x18 + 0x08) + local in_mcast_packets = self:output64(0x18 + 0x48) + local in_bcast_packets = self:output64(0x18 + 0x50) + -- This is weird. The intel_mp driver adds broadcast packets to the + -- mcast counter, it is unclear why. Then + -- lib.ipc.shmem.iftable_mib reverses it to get the true mcast + -- counter back. So we do the same here. The proper fix would be + -- to fix the Intel driver and remove the anti-hack from + -- iftable_mib. + port_stats.rxmcast = in_mcast_packets + in_bcast_packets + port_stats.rxbcast = in_bcast_packets + port_stats.rxpackets = in_ucast_packets + port_stats.rxmcast + port_stats.rxdrop = self:output64(0x18 + 0x10) + port_stats.rxerrors = self:output64(0x18 + 0x18) + + port_stats.txbytes = self:output64(0x18 + 0x28) + local out_ucast_packets = self:output64(0x18 + 0x30) + local out_mcast_packets = self:output64(0x18 + 0x58) + local out_bcast_packets = self:output64(0x18 + 0x60) + port_stats.txmcast = out_mcast_packets + out_bcast_packets + port_stats.txbcast = out_bcast_packets + port_stats.txpackets = out_ucast_packets + port_stats.txmcast + port_stats.txdrop = self:output64(0x18 + 0x38) + port_stats.txerrors = self:output64(0x18 + 0x40) + return port_stats +end + +function HCA:alloc_q_counter() + self:command("ALLOC_Q_COUNTER", 0x18, 0x10C) + :input("opcode", 0x00, 31, 16, 0x771) + :execute() + return self:output(0x08, 7, 0) +end + +local q_stats = { + out_of_buffer = 0ULL +} +function HCA:query_q_counter_start (args) + self:command("QUERY_Q_COUNTER", 0x20, 0x10C) + :input("opcode", 0x00, 31, 16, 0x773) + -- Clear the counter after reading. This allows us to + -- update the rxdrop stat incrementally. + :input("clear", 0x18, 31, 31, 1) + :input("counter_set_id",0x1c, 7, 0, args.set_id) + :execute_async() +end + +local out_of_buffer = 0ULL +function HCA:query_q_counter_finish () + q_stats.out_of_buffer = self:output(0x10 + 0x20, 31, 0) + return q_stats +end + +--------------------------------------------------------------- +-- Command Interface implementation. +-- +-- Sends commands to the HCA firmware and receives replies. +-- Defined in "Command Interface" section of the PRM. +--------------------------------------------------------------- + +local cmdq_entry_t = ffi.typeof("uint32_t[0x40/4]") +local cmdq_mailbox_t = ffi.typeof("uint32_t[0x240/4]") + +-- XXX Check with maximum length of commands that we really use. +local max_mailboxes = 1000 +local data_per_mailbox = 0x200 -- Bytes of input/output data in a mailbox + +-- Create a command queue with dedicated/reusable DMA memory. +function HCA:new () + -- Must only be called from a factory created by HCA_factory() + assert(self ~= HCA) + local q = self.nextq + assert(q < self.size) + self.nextq = self.nextq + 1 + + local inboxes, outboxes = {}, {} + for i = 0, max_mailboxes-1 do + -- XXX overpadding.. 0x240 alignment is not accepted? + inboxes[i] = ffi.cast("uint32_t*", memory.dma_alloc(0x240, 4096)) + outboxes[i] = ffi.cast("uint32_t*", memory.dma_alloc(0x240, 4096)) + end + return setmetatable({entry = ffi.cast("uint32_t *", self.entries[q]), + inboxes = inboxes, + outboxes = outboxes, + q = q}, + {__index = self}) +end + +-- Reset all data structures to zero values. +-- This is to prevent leakage from one command to the next. +local token = 0xAA +function HCA:command (command, last_input_offset, last_output_offset) + if debug_trace then + print("HCA command: " .. command) + end + self.input_size = last_input_offset + 4 + self.output_size = last_output_offset + 4 + + -- Command entry: + + ffi.fill(self.entry, ffi.sizeof(cmdq_entry_t), 0) + self:setbits(0x00, 31, 24, 0x7) -- type + self:setbits(0x04, 31, 0, self.input_size) + self:setbits(0x38, 31, 0, self.output_size) + self:setbits(0x3C, 0, 0, 1) -- ownership = hardware + self:setbits(0x3C, 31, 24, token) + -- Mailboxes: + + -- How many mailboxes do we need? + local ninboxes = math.ceil((self.input_size - 16) / data_per_mailbox) + local noutboxes = math.ceil((self.output_size - 16) / data_per_mailbox) + if ninboxes > max_mailboxes then error("Input overflow: " ..self.input_size) end + if noutboxes > max_mailboxes then error("Output overflow: "..self.output_size) end + + if ninboxes > 0 then + local phy = memory.virtual_to_physical(self.inboxes[0]) + setint(self.entry, 0x08, phy / 2^32) + setint(self.entry, 0x0C, phy % 2^32) + end + if noutboxes > 0 then + local phy = memory.virtual_to_physical(self.outboxes[0]) + setint(self.entry, 0x30, phy / 2^32) + setint(self.entry, 0x34, phy % 2^32) + end + + -- Initialize mailboxes + for i = 0, max_mailboxes-1 do + -- Zap old state + ffi.fill(self.inboxes[i], ffi.sizeof(cmdq_mailbox_t), 0) + ffi.fill(self.outboxes[i], ffi.sizeof(cmdq_mailbox_t), 0) + -- Set mailbox block number + setint(self.inboxes[i], 0x238, i) + setint(self.outboxes[i], 0x238, i) + -- Tokens to match command entry + setint(self.inboxes[i], 0x23C, setbits(23, 16, token, 0)) + setint(self.outboxes[i], 0x23C, setbits(23, 16, token, 0)) + -- Set 'next' mailbox pointers (when used) + if i < ninboxes then + local phy = memory.virtual_to_physical(self.inboxes[i+1]) + setint(self.inboxes[i], 0x230, phy / 2^32) + setint(self.inboxes[i], 0x234, phy % 2^32) + end + if i < noutboxes then + local phy = memory.virtual_to_physical(self.outboxes[i+1]) + setint(self.outboxes[i], 0x230, phy / 2^32) + setint(self.outboxes[i], 0x234, phy % 2^32) + end + end + token = (token == 255) and 1 or token+1 + return self -- for method call chaining +end + +function HCA:getbits (offset, hi, lo) + return getbits(getint(self.entry, offset), hi, lo) +end + +function HCA:setbits (offset, hi, lo, value) + local base = getint(self.entry, offset) + setint(self.entry, offset, setbits(hi, lo, value, base)) +end + +function HCA:input (name, offset, hi, lo, value) + assert(offset % 4 == 0) + if debug_trace and name then + print(("input @ %4xh (%2d:%2d) %-20s = %10xh (%d)"):format(offset, hi, lo, name, value, value)) + end + if offset > self.input_size-4 then + error(("input offset out of bounds: %sh > %sh"):format( + bit.tohex(offset, 4), bit.tohex(self.input_size-4, 4))) + end + if offset <= 16 - 4 then -- inline + self:setbits(0x10 + offset, hi, lo, value) + else + local mailbox_number = math.floor((offset - 16) / data_per_mailbox) + local mailbox_offset = (offset - 16) % data_per_mailbox + local base = getint(self.inboxes[mailbox_number], mailbox_offset) + local newvalue = setbits(hi, lo, value, base) + setint(self.inboxes[mailbox_number], mailbox_offset, newvalue) + end + return self -- for method call chaining +end + +function HCA:output (offset, hi, lo) + if offset <= 16 - 4 then --inline + return self:getbits(0x20 + offset, hi, lo) + else + local mailbox_number = math.floor((offset - 16) / data_per_mailbox) + local mailbox_offset = (offset - 16) % data_per_mailbox + return getbits(getint(self.outboxes[mailbox_number], mailbox_offset), hi, lo) + end +end + +function HCA:output64 (offset) + local high = self:output(offset, 31, 0) + 0ULL + local low = band(self:output(offset+4, 31, 0) + 0ULL, 0xFFFFFFFF) + return shl(high, 32) + low +end + + + +function HCA:setinbits (ofs, ...) --bit1, bit2, val, ... + assert(ofs % 4 == 0) + if ofs <= 16 - 4 then --inline + self:setbits(0x10 + ofs, ...) + else --input mailbox + local mailbox = math.floor((ofs - 16) / data_per_mailbox) + local offset = (ofs - 16) % data_per_mailbox + setint(self.inboxes[mailbox], offset, setbits(...)) + end +end + +function HCA:getoutbits (ofs, bit2, bit1) + if ofs <= 16 - 4 then --inline + return self:getbits(0x20 + ofs, bit2, bit1) + else --output mailbox + local mailbox = math.floor((ofs - 16) / data_per_mailbox) + local offset = (ofs - 16) % data_per_mailbox + local b = getbits(getint(self.outboxes[mailbox], offset), bit2, bit1) + return b + end +end + +-- "Command delivery status" error codes. +local delivery_errors = { + [0x00] = 'no errors', + [0x01] = 'signature error', + [0x02] = 'token error', + [0x03] = 'bad block number', + [0x04] = 'bad output pointer. pointer not aligned to mailbox size', + [0x05] = 'bad input pointer. pointer not aligned to mailbox size', + [0x06] = 'internal error', + [0x07] = 'input len error. input length less than 0x8', + [0x08] = 'output len error. output length less than 0x8', + [0x09] = 'reserved not zero', + [0x10] = 'bad command type', + -- Note: Suspicious to jump from 0x09 to 0x10 here i.e. skipping 0x0A - 0x0F. + -- This is consistent with both the PRM and the Linux mlx5_core driver. +} + +local function checkz (z) + if z == 0 then return end + error('command error: '..(delivery_errors[z] or z)) +end + +-- Command error code meanings. +-- Note: This information is missing from the PRM. Can compare with Linux mlx5_core. +local command_errors = { + -- General: + [0x01] = 'INTERNAL_ERR: internal error', + [0x02] = 'BAD_OP: Operation/command not supported or opcode modifier not supported', + [0x03] = 'BAD_PARAM: parameter not supported; parameter out of range; reserved not equal 0', + [0x04] = 'BAD_SYS_STATE: System was not enabled or bad system state', + [0x05] = 'BAD_RESOURCE: Attempt to access reserved or unallocated resource, or resource in inappropriate status. for example., not existing CQ when creating QP', + [0x06] = 'RESOURCE_BUSY: Requested resource is currently executing a command. No change in any resource status or state i.e. command just not executed.', + [0x08] = 'EXCEED_LIM: Required capability exceeds device limits', + [0x09] = 'BAD_RES_STATE: Resource is not in the appropriate state or ownership', + [0x0F] = 'NO_RESOURCES: Command was not executed because lack of resources (for example ICM pages). This is unrecoverable situation from driver point of view', + [0x50] = 'BAD_INPUT_LEN: Bad command input len', + [0x51] = 'BAD_OUTPUT_LEN: Bad command output len', + -- QP/RQ/SQ/TIP: + [0x10] = 'BAD_RESOURCE_STATE: Attempt to modify a Resource (RQ/SQ/TIP/QPs) which is not in the presumed state', + -- MAD: + [0x30] = 'BAD_PKT: Bad management packet (silently discarded)', + -- CQ: + [0x40] = 'BAD_SIZE: More outstanding CQEs in CQ than new CQ size', +} + +function HCA:post () + self:setbits(0x3C, 0, 0, 1) + self.init_seg:ring_doorbell(self.q) +end + +function HCA:execute_async () + if debug_hexdump then + local dumpoffset = 0 + print("command INPUT:") + dumpoffset = hexdump(self.entry, 0, 0x40, dumpoffset) + local ninboxes = math.ceil((self.input_size + 4 - 16) / data_per_mailbox) + for i = 0, ninboxes-1 do + local blocknumber = getint(self.inboxes[i], 0x238, 31, 0) + local address = memory.virtual_to_physical(self.inboxes[i]) + print("Block "..blocknumber.." @ "..bit.tohex(address, 12)..":") + dumpoffset = hexdump(self.inboxes[i], 0, ffi.sizeof(cmdq_mailbox_t), dumpoffset) + end + end + assert(self:getbits(0x3C, 0, 0) == 1) + self:post() +end + +function HCA:completed () + if self:getbits(0x3C, 0, 0) == 0 then + if debug_hexdump then + local dumpoffset = 0 + print("command OUTPUT:") + dumpoffset = hexdump(self.entry, 0, 0x40, dumpoffset) + local noutboxes = math.ceil((self.output_size + 4 - 16) / data_per_mailbox) + for i = 0, noutboxes-1 do + local blocknumber = getint(self.outboxes[i], 0x238, 31, 0) + local address = memory.virtual_to_physical(self.outboxes[i]) + print("Block "..blocknumber.." @ "..bit.tohex(address, 12)..":") + dumpoffset = hexdump(self.outboxes[i], 0, ffi.sizeof(cmdq_mailbox_t), dumpoffset) + end + end + + local token = self:getbits(0x3C, 31, 24) + local signature = self:getbits(0x3C, 23, 16) + local status = self:getbits(0x3C, 7, 1) + + checkz(status) + self:checkstatus() + + return signature, token + else + if self.init_seg:getbits(0x1010, 31, 24) ~= 0 then + error("HCA health syndrome: " .. bit.tohex(self.init_seg:getbits(0x1010, 31, 24))) + end + return nil, nil + end +end + +function HCA:execute () + self:execute_async() + local signature, token = self:completed() + --poll for command completion + while not signature do + C.usleep(10000) + signature, token = self:completed() + end + return signature, token +end + +-- see 12.2 Return Status Summary +function HCA:checkstatus () + local status = self:getoutbits(0x00, 31, 24) + local syndrome = self:getoutbits(0x04, 31, 0) + if status == 0 then return end + error(string.format('status: 0x%x (%s), syndrome: 0x%x', + status, command_errors[status], syndrome)) +end + + + +--------------------------------------------------------------- +-- Initialization segment access. +-- +-- The initialization segment is a region of memory-mapped PCI +-- registers. This is an interface directly to the hardware and is +-- used for bootstrapping communication with the firmware (amongst +-- other things). +-- +-- Described in the "Initialization Segment" section of the PRM. +--------------------------------------------------------------- + +InitializationSegment = {} + +-- Create an initialization segment object. +-- ptr is a pointer to the memory-mapped registers. +function InitializationSegment:new (ptr) + return setmetatable({ptr = cast('uint32_t*', ptr)}, {__index = InitializationSegment}) +end + +function InitializationSegment:getbits (offset, hi, lo) + return getbits(getint(self.ptr, offset), hi, lo) +end + +function InitializationSegment:setbits (offset, hi, lo, value) + setint(self.ptr, offset, setbits(hi, lo, value, 0)) +end + +function InitializationSegment:fw_rev () --maj, min, subminor + return + self:getbits(0, 15, 0), + self:getbits(0, 31, 16), + self:getbits(4, 15, 0) +end + +function InitializationSegment:cmd_interface_rev () + return self:getbits(4, 31, 16) +end + +function InitializationSegment:cmdq_phy_addr (addr) + if addr then + --must write the MSB of the addr first + self:setbits(0x10, 31, 0, ptrbits(addr, 63, 32)) + --also resets nic_interface and log_cmdq_* + self:setbits(0x14, 31, 12, ptrbits(addr, 31, 12)) + else + return cast('void*', + cast('uint64_t', self:getbits(0x10, 31, 0) * 2^32 + + cast('uint64_t', self:getbits(0x14, 31, 12)) * 2^12)) + end +end + +function InitializationSegment:nic_interface (mode) + self:setbits(0x14, 9, 8, mode) +end + +function InitializationSegment:log_cmdq_size () + return self:getbits(0x14, 7, 4) +end + +function InitializationSegment:log_cmdq_stride () + return self:getbits(0x14, 3, 0) +end + +function InitializationSegment:ring_doorbell (i) + self:setbits(0x18, i, i, 1) +end + +function InitializationSegment:ready (i, val) + return self:getbits(0x1fc, 31, 31) == 0 +end + +function InitializationSegment:nic_interface_supported () + return self:getbits(0x1fc, 26, 24) == 0 +end + +function InitializationSegment:internal_timer () + return + self:getbits(0x1000, 31, 0) * 2^32 + + self:getbits(0x1004, 31, 0) +end + +function InitializationSegment:clear_int () + self:setbits(0x100c, 0, 0, 1) +end + +function InitializationSegment:health_syndrome () + return self:getbits(0x1010, 31, 24) +end + +function InitializationSegment:reset () + -- Not covered in PRM + self:setbits(0x14, 10, 8, 0x7) +end + +function InitializationSegment:dump () + print('fw_rev ', self:fw_rev()) + print('cmd_interface_rev ', self:cmd_interface_rev()) + print('cmdq_phy_addr ', self:cmdq_phy_addr()) + print('log_cmdq_size ', self:log_cmdq_size()) + print('log_cmdq_stride ', self:log_cmdq_stride()) + print('ready ', self:ready()) + print('nic_interface_supported ', self:nic_interface_supported()) + print('internal_timer ', self:internal_timer()) + print('health_syndrome ', self:health_syndrome()) +end + + +--------------------------------------------------------------- +-- Utilities. +--------------------------------------------------------------- + +-- Print a hexdump in the same format as the Linux kernel mlx5 driver. +-- +-- Optionally take a 'dumpoffset' giving the logical address where the +-- trace starts (useful when printing multiple related hexdumps i.e. +-- for consistency with the Linux mlx5_core driver format). +function hexdump (pointer, index, bytes, dumpoffset) + local u8 = ffi.cast("uint8_t*", pointer) + dumpoffset = dumpoffset or 0 + for i = 0, bytes-1 do + if i % 16 == 0 then + if i > 0 then io.stdout:write("\n") end + io.stdout:write(("%03x: "):format(dumpoffset+i)) + elseif i % 4 == 0 then + io.stdout:write(" ") + end + io.stdout:write(bit.tohex(u8[index+i], 2)) + end + io.stdout:write("\n") + io.flush() + return dumpoffset + bytes +end + +-- Utilities for peeking and poking bitfields of 32-bit big-endian integers. +-- Pointers are uint32_t* and offsets are in bytes. + +-- Return the value at offset from address. +function getint (pointer, offset) + assert(offset % 4 == 0, "offset not dword-aligned") + local r = bswap(pointer[offset/4]) + return r +end + +-- Set the the value at offset from address. +function setint (pointer, offset, value) + assert(offset % 4 == 0, "offset not dword-aligned") + pointer[offset/4] = bswap(tonumber(value)) +end + +-- Return the hi:lo bits of value. +function getbits (value, hi, lo) + local mask = shl(2^(hi-lo+1)-1, lo) + local r = shr(band(value, mask), lo) + --print("getbits", bit.tohex(value), hi, lo, bit.tohex(r)) + return r +end + +-- Return the hi:lo bits of a pointer. +function ptrbits (pointer, hi, lo) + return tonumber(getbits(cast('uint64_t', pointer), hi, lo)) +end + +-- Set value in bits hi:lo of (optional) base. +function setbits (hi, lo, value, base) + base = base or 0 + local mask = shl(2^(hi-lo+1)-1, lo) + local newbits = band(shl(value, lo), mask) + local oldbits = band(base, bnot(mask)) + return bor(newbits, oldbits) +end + +function log2size (size) + -- Return log2 of size rounded up to nearest whole number. + -- + -- Note: Lua provides only natural logarithm function (base e) built-in. + -- See http://www.mathwords.com/c/change_of_base_formula.htm + return math.ceil(math.log(size) / math.log(2)) +end + +function check_pow2 (num) + return bit.band(num, num - 1) == 0 +end + +function selftest () + io.stdout:setvbuf'no' + + local pcidev0 = lib.getenv("SNABB_PCI_CONNECTX_0") + local pcidev1 = lib.getenv("SNABB_PCI_CONNECTX_1") + -- XXX check PCI device type + if not pcidev0 then + print("SNABB_PCI_CONNECTX_0 not set") + os.exit(engine.test_skipped_code) + end + if not pcidev1 then + print("SNABB_PCI_CONNECTX_1 not set") + os.exit(engine.test_skipped_code) + end + + local io0 = IO:new({pciaddress = pcidev0, queue = 'a'}) + local io1 = IO:new({pciaddress = pcidev1, queue = 'b'}) + io0.input = { input = link.new('input0') } + io0.output = { output = link.new('output0') } + io1.input = { input = link.new('input1') } + io1.output = { output = link.new('output1') } + -- Exercise the IO apps before the NIC is initialized. + io0:pull() io0:push() io1:pull() io1:push() + local nic0 = ConnectX:new{pciaddress = pcidev0, queues = {{id='a'}}} + local nic1 = ConnectX:new{pciaddress = pcidev1, queues = {{id='b'}}} + + print("selftest: waiting for both links up") + while (nic0.hca:query_vport_state().oper_state ~= 1) or + (nic1.hca:query_vport_state().oper_state ~= 1) do + C.usleep(1e6) + end + + local bursts = 10000 + local each = 100 + local octets = 100 + print(("Links up. Sending %s packets."):format(lib.comma_value(each*bursts))) + + for i = 1, bursts + 100 do + for id, app in ipairs({io0, io1}) do + if i <= bursts then + for i = 1, each do + local p = packet.allocate() + ffi.fill(p.data, octets, 0) -- zero packet + local header = lib.hexundump("000000000001 000000000002 0800", 14) + ffi.copy(p.data, header, #header) + p.data[12] = 0x08 -- ethertype = 0x0800 + p.length = octets + link.transmit(app.input.input, p) + end + end + app:pull() + app:push() + while not link.empty(io0.output.output) do packet.free(link.receive(io0.output.output)) end + while not link.empty(io1.output.output) do packet.free(link.receive(io1.output.output)) end + end + end + print("link", "txpkt", "txbyte", "txdrop") + local i0 = io0.input.input + local i1 = io1.input.input + local o0 = io0.output.output + local o1 = io1.output.output + print("send0", tonumber(counter.read(i0.stats.txpackets)), tonumber(counter.read(i0.stats.txbytes)), tonumber(counter.read(i0.stats.txdrop))) + print("send1", tonumber(counter.read(i1.stats.txpackets)), tonumber(counter.read(i1.stats.txbytes)), tonumber(counter.read(i1.stats.txdrop))) + print("recv0", tonumber(counter.read(o0.stats.txpackets)), tonumber(counter.read(o0.stats.txbytes)), tonumber(counter.read(o0.stats.txdrop))) + print("recv1", tonumber(counter.read(o1.stats.txpackets)), tonumber(counter.read(o1.stats.txbytes)), tonumber(counter.read(o1.stats.txdrop))) + + -- print("payload snippets of first 5 packets") + -- print("port0") + -- for i = 1, 5 do + -- local p = link.receive(o0) + -- if p then print(p.length, lib.hexdump(ffi.string(p.data, math.min(32, p.length)))) end + -- end + -- print("port1") + -- for i = 1, 5 do + -- local p = link.receive(o1) + -- if p then print(p.length, lib.hexdump(ffi.string(p.data, math.min(32, p.length)))) end + -- end + + print() + print(("%-16s %20s %20s"):format("hardware counter", pcidev0, pcidev1)) + print("---------------- -------------------- --------------------") + + local stat0 = nic0.hca:query_vport_counter() + local stat1 = nic1.hca:query_vport_counter() + + -- Sort into key order + local t = {} + for k in pairs(stat0) do table.insert(t, k) end + table.sort(t) + for _, k in pairs(t) do + print(("%-16s %20s %20s"):format(k, lib.comma_value(stat0[k]), lib.comma_value(stat1[k]))) + end + + nic0:stop() + nic1:stop() + io0:stop() + io1:stop() + + if (stat0.tx_ucast_packets == bursts*each and stat0.tx_ucast_octets == bursts*each*octets and + stat1.tx_ucast_packets == bursts*each and stat1.tx_ucast_octets == bursts*each*octets) then + print("selftest: ok") + else + error("selftest failed: unexpected counter values") + end +end diff --git a/src/apps/mellanox/connectx_test.lua b/src/apps/mellanox/connectx_test.lua new file mode 100644 index 0000000000..cba693ddcd --- /dev/null +++ b/src/apps/mellanox/connectx_test.lua @@ -0,0 +1,343 @@ +-- Test suite for the Mellanox ConnectX driver. +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. +module(..., package.seeall) + +local ffi = require("ffi") +local C = ffi.C +local connectx = require("apps.mellanox.connectx") +local counter = require("core.counter") +local lib = require("core.lib") + +-- Test scenarios: +-- unicast-multiqueue +-- number of queues + +-- Test sending traffic between two directly attached network interfaces. +-- +-- pci0, pci1: device PCI addresses +-- npackets: number of packets to transfer (lower bound) +-- ncores: number of CPU cores per network interface +-- minlen: minimum packet length (excl. ethernet FCS) +-- maxlen: maximum packet length +-- minburst: minimum burst size (packets) sent to the driver +-- maxburst: maximum burst size +-- macs: number of unique mac addresses +-- vlans: number of unique VLAN IDs +-- rss: number of RSS hash buckets. +-- +-- Hardware queue count will be macs*vlans*rss on each interface. +function switch (pci0, pci1, npackets, ncores, minlen, maxlen, minburst, maxburst, macs, vlans, rss) + print("selftest: connectx_test switch") + assert(ncores == 1, "multicore not yet handled") + -- Create queue definitions + local queues = {} + for vlan = 1, vlans do + for mac = 1, macs do + for q = 1, rss do + local id = ("vlan%d.mac%d.rss%d"):format(vlan, mac, q) + queues[#queues+1] = {id=id, vlan=vlan, mac="00:00:00:00:00:"..bit.tohex(mac, 2)} + end + end + end + -- Instantiate app network + local nic0 = connectx.ConnectX:new({pciaddress=pci0, queues=queues}) + local nic1 = connectx.ConnectX:new({pciaddress=pci1, queues=queues}) + local io0 = {} -- io apps on nic0 + local io1 = {} -- io apps on nic1 + print(("creating %d queues per device..."):format(#queues)) + for _, queue in ipairs(queues) do + local function ioapp (pci, queue) + local a = connectx.IO:new({pciaddress=pci, queue=queue.id}) + a.input = { input = link.new(("input-%s-%s" ):format(pci, queue.id)) } + a.output = { output = link.new(("output-%s-%s"):format(pci, queue.id)) } + return a + end + io0[queue.id] = ioapp(pci0, queue) + io1[queue.id] = ioapp(pci1, queue) + end + -- Create diverse packet payload templates + print("creating payloads...") + local payload = {} + local npayloads = 1000 + for i = 1, npayloads do + local p = packet.allocate() + payload[i] = p + p.length = between(minlen, maxlen) + ffi.fill(p.data, p.length, 0) + + -- MAC destination + local r = math.random() + if r < 0.05 then -- 5% of packets are broadcast + ffi.fill(p.data, 6, 0xFF) + elseif r < 0.10 then -- 5% of packets are multicast + p.data[0], p.data[1] = 0x33, 0x33 -- "locally administered" multicast + elseif r < 0.20 then -- 10% are unicast to random destinations + for i = 1, 5 do p.data[i] = math.random(256) - 1 end + else -- rest are unicast to known mac + p.data[5] = between(1, macs) + end + + -- MAC source + for i = 7, 11 do p.data[i] = math.random(256) - 1 end + + -- 802.1Q + p.data[12] = 0x81 + p.data[15] = between(1, vlans) -- vlan id can be out of expected range + p.data[16] = 0x08 -- ipv4 + + local ip_ofs = 18 + + -- IPv4 + local ip = require("lib.protocol.ipv4"):new{ + src = lib.random_bytes(4), + dst = lib.random_bytes(4), + ttl = 64 + } + if r < 0.50 then -- 50% of packets are UDP (have L4 header) + ip:protocol(17) -- UDP + else -- rest have random payloads + ip:protocol(253) + end + ip:copy(p.data+ip_ofs, 'relocate') + ip:total_length(p.length-ip_ofs) + ip:checksum() + + if ip:protocol() == 17 then + -- UDP + local udp = require("lib.protocol.udp"):new{ + src_port = math.random(30000), + dst_port = math.random(30000) + } + udp:copy(p.data+ip_ofs+ip:sizeof(), 'relocate') + udp:length(p.length-(ip_ofs+ip:sizeof())) + + -- Random payload + for i = ip_ofs+ip:sizeof()+udp:sizeof(), p.length-1 do + p.data[i] = math.random(256) - 1 + end + + -- UDP checksum + udp:checksum(p.data, p.length-(ip_ofs+ip:sizeof()+udp:sizeof()), ip) + + else + -- Random payload + for i = ip_ofs+ip:sizeof(), p.length-1 do + p.data[i] = math.random(256) - 1 + end + end + + --print(lib.hexdump(ffi.string(p.data, 32))) + end + -- Wait for linkup on both ports + print("waiting for linkup...") + while not (nic0.hca:linkup() and nic1.hca:linkup()) do C.usleep(0.25e6) end + -- Send packets + print("sending packets...") + + local function dump (pci, id, app) + -- Dump received packets + while not link.empty(app.output.output) do + local p = link.receive(app.output.output) + --print(("recv %s %4d %s: %s"):format(pci, p.length, id, lib.hexdump(ffi.string(p.data, 32)))) + packet.free(p) + end + end + + local start = engine.now() + local remaining = npackets + engine.vmprofile_enabled = true + engine.setvmprofile("connectx") + while remaining > 0 do + -- Send packets + for id, _ in pairs(io0) do + for i = 1, between(minburst, maxburst) do + if remaining > 0 then + local p = payload[between(1, npayloads)] + --print(("send(%4d): %s"):format(p.length, lib.hexdump(ffi.string(p.data, 32)))) + link.transmit(io0[id].input.input, packet.clone(p)) + link.transmit(io1[id].input.input, packet.clone(p)) + remaining = remaining - 1 + end + end + end + -- Simulate breathing + --C.usleep(100) + for id, app in pairs(io0) do app:pull() app:push() dump(pci0, id, app) end + for id, app in pairs(io1) do app:pull() app:push() dump(pci1, id, app) end + -- Simulate breathing + end + engine.setvmprofile("engine") + -- Receive any last packets + C.usleep(100) + for i = 1, 10 do + for id, app in pairs(io0) do app:pull() app:push() dump(pci0, id, app) end + for id, app in pairs(io1) do app:pull() app:push() dump(pci1, id, app) end + end + local finish = engine.now() + print("reporting...") + print(("%-16s %20s %20s"):format("hardware counter", pci0, pci1)) + print("---------------- -------------------- --------------------") + local stat0 = nic0.hca:query_vport_counter() + local stat1 = nic1.hca:query_vport_counter() + -- Sort into key order + local t = {} + for k in pairs(stat0) do table.insert(t, k) end + table.sort(t) + for _, k in pairs(t) do + print(("%-16s %20s %20s"):format(k, lib.comma_value(stat0[k]), lib.comma_value(stat1[k]))) + end + + local received = {[pci0]={}, [pci1]={}} + print(("@@ %16s; %12s; %12s; %12s; %12s; %12s; %12s; %12s"):format( + "nic", "link", "txpkt", "txbyte", "txdrop", "rxpkt", "rxbyte", "rxdrop")) + -- Sort into key order + local t = {} + for k in pairs(io0) do table.insert(t, k) end + table.sort(t) + for _, id in pairs(t) do + local function prlink (nic, id, app) + local function count (cnt) return tonumber(counter.read(cnt)) end + local stx = app.input.input.stats + local srx = app.output.output.stats + print(("@@ %16s; %12s; %12d; %12d; %12d; %12d; %12d; %12d"):format( + nic, id, + count(stx.txpackets), count(stx.txbytes), count(stx.txdrop), + count(srx.txpackets), count(srx.txbytes), count(srx.txdrop))) + received[nic][#received[nic]+1] = count(srx.txpackets) + end + prlink(pci0, id, io0[id]) + prlink(pci1, id, io1[id]) + end + print(("time: %.1fs - Mpps: %.3f per NIC"):format(finish-start, npackets/1e6/(finish-start))) + + print("hardware counter check") + assert(stat0.tx_ucast_packets+stat0.tx_mcast_packets+stat0.tx_bcast_packets == npackets, "0: sent too little") + assert(stat1.tx_ucast_packets+stat1.tx_mcast_packets+stat1.tx_bcast_packets == npackets, "1: sent too little") + assert(stat0.tx_ucast_packets == stat1.rx_ucast_packets, "0.tx_ucast != 1.rx_ucast") + assert(stat1.tx_ucast_packets == stat0.rx_ucast_packets, "1.tx_ucast != 0.rx_ucast") + assert(stat0.tx_mcast_packets*2 == stat1.rx_mcast_packets, "0.tx_mcast*2 != 1.rx_mcast") + assert(stat1.tx_mcast_packets*2 == stat0.rx_mcast_packets, "1.tx_mcast*2 != 0.rx_mcast") + assert(stat0.tx_bcast_packets*2 == stat1.rx_bcast_packets, "0.tx_bcast*2 != 1.rx_bcast") + assert(stat1.tx_bcast_packets*2 == stat0.rx_bcast_packets, "1.tx_bcast*2 != 0.rx_bcast") + + for _, nic in pairs{pci0, pci1} do + local sum, avg, sd = sum(received[nic]), mean(received[nic]), stdev(received[nic]) + print(("RX check %s sum=%d avg=%.1f sd=%.1f") + :format(nic, sum, avg, sd)) + -- expect some slack because we send 10% to random MACs + assert(sum >= npackets*.8, "received too little") + -- expect more packets on queues 0 because we send 10% mcast, + -- but mostly even distribution of packets + assert(sd / avg < .2, "uneven packet distribution") + end + + nic0:stop() + nic1:stop() + for _, queue in ipairs(queues) do + io0[queue.id]:stop() + link.free(io0[queue.id].input.input, ("input-%s-%s" ):format(pci0, queue.id)) + link.free(io0[queue.id].output.output, ("output-%s-%s" ):format(pci0, queue.id)) + io1[queue.id]:stop() + link.free(io1[queue.id].input.input, ("input-%s-%s" ):format(pci1, queue.id)) + link.free(io1[queue.id].output.output, ("output-%s-%s" ):format(pci1, queue.id)) + end + + print("selftest: done") +end + +-- Return a random number between min and max (inclusive.) +function between (min, max) + if min == max then + return min + else + return min + math.random(max-min+1) - 1 + end +end + +function sum (values) + local sum = 0 + for _, value in ipairs(values) do + sum = sum + value + end + return sum +end + +function mean (values) + return sum(values) / #values +end + +function stdev (values) + local avg = mean(values) + local var = {} + for _, value in ipairs(values) do + var[#var+1] = (value-avg)^2 + end + return math.sqrt(mean(var)) +end + +function basic_match (pci0, pci1) + print("selftest: connectx_test match") + + local packet_count = 1001 + local src, dst = "00:00:00:00:00:01", "00:00:00:00:00:02" + + local basic = require("apps.basic.basic_apps") + local match = require("apps.test.match") + local npackets = require("apps.test.npackets") + local synth = require("apps.test.synth") + local counter = require("core.counter") + + local c = config.new() + config.app(c, "synth", synth.Synth, { + sizes={64,67,128,133,192,256,384,512,777,1024}, + src=src, + dst=dst, + random_payload=true + }) + config.app(c, "tee", basic.Tee) + config.app(c, "match", match.Match) + config.app(c, "npackets", npackets.Npackets, {npackets=packet_count}) + config.app(c, "nic0", connectx.ConnectX, { + pciaddress=pci0, + queues={{id="io0", mac=src}} + }) + config.app(c, "io0", connectx.IO, {pciaddress=pci0, queue="io0"}) + config.app(c, "nic1", connectx.ConnectX, { + pciaddress=pci1, + queues={{id="io1", mac=dst}} + }) + config.app(c, "io1", connectx.IO, {pciaddress=pci1, queue="io1"}) + + config.link(c, "synth.output -> npackets.input") + config.link(c, "npackets.output -> tee.input") + config.link(c, "tee.output1 -> io0.input") + config.link(c, "io1.output -> match.rx") + config.link(c, "tee.output2 -> match.comparator") + + engine.configure(c) + + engine.main({duration = 1, report = false}) + engine.report_links() + engine.report_apps() + + local m = engine.app_table['match'] + assert(#m:errors() == 0, "Corrupt packets.") + + engine.configure(config.new()) + + print("selftest: done") +end + +function selftest () + local pci0 = os.getenv("SNABB_PCI_CONNECTX_0") + local pci1 = os.getenv("SNABB_PCI_CONNECTX_1") + if not (pci0 and pci1) then + print("SNABB_PCI_CONNECTX_0 and SNABB_PCI_CONNECTX_1 must be set. Skipping selftest.") + os.exit(engine.test_skipped_code) + end + basic_match(pci0, pci1) + switch(pci0, pci1, 10e6, 1, 60, 1500, 100, 100, 2, 2, 4) + switch(pci0, pci1, 10e6, 1, 60, 1500, 100, 100, 1, 2, 8) + switch(pci0, pci1, 10e6, 1, 60, 1500, 100, 100, 4, 1, 4) +end + diff --git a/src/apps/mellanox/trace-mlx5_core.txt.gz b/src/apps/mellanox/trace-mlx5_core.txt.gz new file mode 100644 index 0000000000..ca40de3e74 Binary files /dev/null and b/src/apps/mellanox/trace-mlx5_core.txt.gz differ diff --git a/src/apps/test/README.md b/src/apps/test/README.md index 6d4ef5bb31..28dda5b634 100644 --- a/src/apps/test/README.md +++ b/src/apps/test/README.md @@ -79,6 +79,11 @@ Generate a random payload for each packet in `sizes`. Insert the packet number (32bit uint) directly after the ethertype. The packet number starts at 0 and is sequential on each output link. +— Key **packets** + +Emit *packets* (an array of *packets*) instead of synthesizing packets. When +this option is used *src*, *dst*, *sizes*, and *random_payload* are ignored. + ## Npackets (apps.test.npackets) The `Npackets` app allows are most N packets to flow through it. Any further diff --git a/src/apps/test/synth.lua b/src/apps/test/synth.lua index c75ebd2e93..d4558dc14d 100644 --- a/src/apps/test/synth.lua +++ b/src/apps/test/synth.lua @@ -15,48 +15,56 @@ Synth = { dst = {default='00:00:00:00:00:00'}, random_payload = { default = false }, packet_id = { default = false }, + packets = {} } } function Synth:new (conf) assert(#conf.sizes >= 1, "Needs at least one size.") - local packets = {} - for i, size in ipairs(conf.sizes) do - local payload_size = size - ethernet:sizeof() - assert(payload_size >= 0 and payload_size <= 1536, - "Invalid payload size: "..payload_size) - local data - if conf.random_payload then - data = lib.random_bytes(payload_size) - else - data = ffi.new("char[?]", payload_size) + local packets = conf.packets + if not packets then + packets = {} + for i, size in ipairs(conf.sizes) do + local payload_size = size - ethernet:sizeof() + assert(payload_size >= 0 and payload_size <= 1536, + "Invalid payload size: "..payload_size) + local data + if conf.random_payload then + data = lib.random_bytes(payload_size) + else + data = ffi.new("char[?]", payload_size) + end + local dgram = datagram:new(packet.from_pointer(data, payload_size)) + local ether = ethernet:new({ src = ethernet:pton(conf.src), + dst = ethernet:pton(conf.dst), + type = payload_size }) + dgram:push(ether) + packets[i] = dgram:packet() end - local dgram = datagram:new(packet.from_pointer(data, payload_size)) - local ether = ethernet:new({ src = ethernet:pton(conf.src), - dst = ethernet:pton(conf.dst), - type = payload_size }) - dgram:push(ether) - packets[i] = dgram:packet() end - return setmetatable({packets=packets}, {__index=Synth}) + return setmetatable( + {cursor=0, pktid=(conf.packet_id and 0), packets=packets}, + {__index=Synth} + ) end function Synth:pull () + local burst = engine.pull_npackets + local packets, npackets = self.packets, #self.packets for _, o in ipairs(self.output) do - local n = 0 - while n < engine.pull_npackets do - for _, p in ipairs(self.packets) do - local c = packet.clone(p) - if self.packet_id then - -- 14 == sizeof(dstmac srcmac type) - ffi.cast("uint32_t *", clone.data+14)[0] = lib.htonl(self.pktid) - self.pktid = self.pktid + 1 - end - transmit(o, c) - n = n + 1 - end + local cursor = self.cursor + for _ = 1, burst do + local p = packet.clone(packets[1+cursor]) + if self.packet_id then + -- 14 == sizeof(dstmac srcmac type) + ffi.cast("uint32_t *", p.data+14)[0] = lib.htonl(self.pktid) + self.pktid = self.pktid + 1 + end + transmit(o, p) + cursor = (cursor + 1) % npackets end end + self.cursor = (self.cursor + burst) % npackets end function Synth:stop () diff --git a/src/apps/xdp/README.md b/src/apps/xdp/README.md new file mode 100644 index 0000000000..76af18e5d2 --- /dev/null +++ b/src/apps/xdp/README.md @@ -0,0 +1,75 @@ +# XDP socket app (apps.xdp.xdp) + +The `XDP` app implements a driver for Linux `AF_XDP` sockets. + +Its links are named `input` and `output`. + + DIAGRAM: XDP + +-----------+ + | | + input ---->* XDP *----> output + | | + +-----------+ + +**Important:** To use the _XDP_ app, “Snabb XDP mode“ must be enabled by +calling `xdp.snabb_enable_xdp()`. Calling this function replaces Snabb's native +memory allocator with the _UMEM_ allocator. The caller must ensure that no +packets have been allocated via `packet.allocate()` prior to calling this +function. + +## _Caveats_ + + * Memory allocated by the UMEM allocator can not be used with _DMA_ + drivers: using the XDP app precludes the use of Snabb’s native + hardware drivers such as `apps.intel_mp.intel_mp`. + + * Memory allocated by the UMEM allocator can not be shared with + other Snabb processes in the same process group: using + snabb_enable_xdp precludes the use of Interlink apps + (`apps.interlink`). + +## Maximum MTU + +Due to a combination of how Snabb uses packet buffers and a limitation of +`AF_XDP` the effective maximum MTU of the XDP app is 3,582. + +## Configuration + +— Key **ifname** + +*Required*. The name of the interface as shown in `ip link`. + +— Key **filter** + +*Optional*. A `pcap-filter(7)` expression. If given, packets that do not match +the filter will we passed on to the host networking stack. Must be the same for +all instances of the XDP app on a given interface! + +— Key **queue** + +*Optional*. Queue to bind to (zero based). The default is queue 0. + +## Module functions + +— Function **snabb_enable_xdp** *options* + +Enables “Snabb XDP mode”. See _Caveats_! + +### *Options* + +*Options* is a table of configuration options. The following parameters are +supported: + + - `num_chunks`—number of UMEM chunks to allocate. The default is 200,000 which + might not be enough depending on the number of XDP sockets used by the + process. Each instance of the XDP app uses up to around 25,000 chunks at any + time. However, generous over-provisioning (at least double of the expected + residency) is recommended due to buffering in the Snabb engine. + +## Setting up XDP capable devices under Linux + +``` +$ echo 0000:01:00.0 > /sys/bus/pci/drivers/ixgbe/bind +$ ip link set ens1f0 addr 02:00:00:00:00:00 +$ ethtool --set-channels ens1f0 combined 1 +``` diff --git a/src/apps/xdp/bpf.lua b/src/apps/xdp/bpf.lua new file mode 100644 index 0000000000..34107a9dda --- /dev/null +++ b/src/apps/xdp/bpf.lua @@ -0,0 +1,300 @@ +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. + +module(...,package.seeall) + +local ffi = require("ffi") +local band, bor = bit.band, bit.bor + +-- BPF: just enough eBPF to assemble trivial XDP programs. +-- +-- See "BPF Architecture": +-- https://docs.cilium.io/en/v1.6/bpf/#bpf-architecture +-- +-- See Linux v4.19: +-- include/uapi/linux/bpf_common.h +-- include/uapi/linux/bpf.h +-- tools/include/linux/filter.h + +ins = ffi.typeof[[ + struct { + uint8_t op; /* opcode */ + uint8_t dst:4; /* dest register */ + uint8_t src:4; /* source register */ + int16_t off; /* signed offset */ + int32_t imm; /* signed immediate constant */ + } __attribute__((packed)) +]] + +c = { -- Op class + LD = 0x00, + LDX = 0x01, + ST = 0x02, + STX = 0x03, + ALU = 0x04, + JMP = 0x05, + RET = 0x06, + ALU64 = 0x07, -- alu mode in double word width + mask = 0x07 +} + +f = { -- Load/store width + W = 0x00, -- 32-bit + H = 0x08, -- 16-bit + B = 0x10, -- 8-bit + DW = 0x18, -- 64-bit + mask = 0x18 +} + +m = { -- Op mode + IMM = 0x00, + ABS = 0x20, + IND = 0x40, + MEM = 0x60, + LEN = 0x80, + MSH = 0xa0, + XADD = 0xc0, -- exclusive add + mask = 0xe0 +} + +a = { -- ALU mode + ADD = 0x00, + SUB = 0x10, + MUL = 0x20, + DIV = 0x30, + OR = 0x40, + AND = 0x50, + LSH = 0x60, + RSH = 0x70, + NEG = 0x80, + MOD = 0x90, + XOR = 0xa0, + MOV = 0xb0, + END = 0xd0, -- Endianness conversion: + LE = 0x00, -- * to little endian + BE = 0x08, -- * to big endian + mask = 0xf0 +} + +s = { -- Src mode + K = 0x00, + X = 0x08, + MAP_FD = 0x01, + mask = 0x08 +} + +j = { -- JMP mode + JA = 0x00, + JEQ = 0x10, + JGT = 0x20, + JGE = 0x30, + JSET = 0x40, + JNE = 0x50, + JLT = 0xa0, + JLE = 0xb0, + JSGT = 0x60, + JSGE = 0x70, + JSLT = 0xc0, + JSLE = 0xd0, + CALL = 0x80, + EXIT = 0x90, + mask = 0xf0 +} + +fn = { -- Built-in helpers + unspec = 0, + map_lookup_elem = 1, + map_update_elem = 2, + map_delete_elem = 3, + probe_read = 4, + ktime_get_ns = 5, + trace_printk = 6, + get_prandom_u32 = 7, + get_smp_processor_id = 8, + skb_store_bytes = 9, + l3_csum_replace = 10, + l4_csum_replace = 11, + tail_call = 12, + clone_redirect = 13, + get_current_pid_tgid = 14, + get_current_uid_gid = 15, + get_current_comm = 16, + get_cgroup_classid = 17, + skb_vlan_push = 18, + skb_vlan_pop = 19, + skb_get_tunnel_key = 20, + skb_set_tunnel_key = 21, + perf_event_read = 22, + redirect = 23, + get_route_realm = 24, + perf_event_output = 25, + skb_load_bytes = 26, + get_stackid = 27, + csum_diff = 28, + skb_get_tunnel_opt = 29, + skb_set_tunnel_opt = 30, + skb_change_proto = 31, + skb_change_type = 32, + skb_under_cgroup = 33, + get_hash_recalc = 34, + get_current_task = 35, + probe_write_user = 36, + current_task_under_cgroup = 37, + skb_change_tail = 38, + skb_pull_data = 39, + csum_update = 40, + set_hash_invalid = 41, + get_numa_node_id = 42, + skb_change_head = 43, + xdp_adjust_head = 44, + probe_read_str = 45, + get_socket_cookie = 46, + get_socket_uid = 47, + set_hash = 48, + setsockopt = 49, + skb_adjust_room = 50, + redirect_map = 51, + sk_redirect_map = 52, + sock_map_update = 53, + xdp_adjust_meta = 54, + perf_event_read_value = 55, + perf_prog_read_value = 56, + getsockopt = 57, + override_return = 58, + sock_ops_cb_flags_set = 59, + msg_redirect_map = 60, + msg_apply_bytes = 61, + msg_cork_bytes = 62, + msg_pull_data = 63, + bind = 64, + xdp_adjust_tail = 65, + skb_get_xfrm_state = 66, + get_stack = 67, + skb_load_bytes_relative = 68, + fib_lookup = 69, + sock_hash_update = 70, + msg_redirect_hash = 71, + sk_redirect_hash = 72, + lwt_push_encap = 73, + lwt_seg6_store_bytes = 74, + lwt_seg6_adjust_srh = 75, + lwt_seg6_action = 76, + rc_repeat = 77, + rc_keydown = 78, + skb_cgroup_id = 79, + get_current_cgroup_id = 80, + get_local_storage = 81, + sk_select_reuseport = 82, + skb_ancestor_cgroup_id = 83, +} + +function asm (insn) return ffi.typeof("$[?]", ins)(#insn, insn) end + +function dis (insn) + local pc = 0 + local function which (v, typ) + return band(v, typ.mask) + end + local function name (x, typ) + for k, v in pairs(typ) do + if k ~= "mask" and x == v then + return k + end + end + end + local function dis_ins (ins) + local str = "" + -- Class + local class = which(ins.op, c) + str = str..name(class, c) + if class <= c.STX then + -- Load/store + local width = which(ins.op, f) + str = str.." "..name(width, f) + local mode = which(ins.op, m) + --str = str.." "..name(mode, m) + str = str..("\tr%d"):format(ins.dst) + if class > c.LDX then + -- Store offset. + str = str..("+%d"):format(ins.off) + end + if mode == m.IMM then + str = str..(" %d %s"):format(ins.imm, name(ins.src, s)) + else + str = str..(" r%d"):format(ins.src) + if class <= c.LDX then + -- Load offset. + str = str..("+%d"):format(ins.off) + end + end + if mode == m.ABS then + str = str..("+%d"):format(ins.imm) + end + elseif class == c.ALU or class == c.ALU64 then + -- ALU + local alu = which(ins.op, a) + str = str.." "..name(alu, a) + local src = which(ins.op, s) + str = str..("\tr%d"):format(ins.dst) + if src == s.K then + -- Immediate operand + str = str..(" %d"):format(ins.imm) + else + -- Register operand + str = str..(" r%d"):format(ins.src) + end + elseif class == c.JMP then + -- Jump + local jmp = which(ins.op, j) + str = str.." "..name(jmp, j) + if jmp == j.EXIT then + elseif jmp == j.CALL then + -- Call + str = str.."\t"..(name(ins.imm, fn) or ("%x"):format(ins.imm)) + else + -- Relative jump + str = str.."\t" + if jmp > j.JA then + -- Conditional + str = str..("r%d"):format(ins.dst) + if which(ins.op, s) == s.K then + -- Immediate operand + str = str..(" %d"):format(ins.imm) + else + -- Register operand + str = str..(" r%d"):format(ins.src) + end + end + str = str..("\t=> %d"):format(pc + 1 + ins.off) + end + else + -- Return + local mode = which(ins.op, m) + if mode == m.IMM then + str = str.." "..name(mode, m) + str = str..("\t%d"):format(ins.imm) + end + end + return str + end + while pc < ffi.sizeof(insn) / ffi.sizeof(ins) do + print(pc, dis_ins(insn[pc])) + pc = pc + 1 + end +end + +function selftest () + local insns = asm{ + -- r3 = XDP_ABORTED + { op=bor(c.ALU, a.MOV, s.K), dst=3, imm=0 }, + -- r2 = ((struct xdp_md *)ctx)->rx_queue_index + { op=bor(c.LDX, f.W, m.MEM), dst=2, src=1, off=16 }, + -- r1 = xskmap + { op=bor(c.LD, f.DW, m.IMM), dst=1, src=s.MAP_FD, imm=4 }, + { imm=0 }, -- nb: upper 32 bits of 64-bit (DW) immediate + -- r0 = redirect_map(r1, r2, r3) + { op=bor(c.JMP, j.CALL), imm=fn.redirect_map }, + -- EXIT: + { op=bor(c.JMP, j.EXIT) } + } + dis(insns) +end diff --git a/src/apps/xdp/pf_ebpf_codegen.lua b/src/apps/xdp/pf_ebpf_codegen.lua new file mode 100644 index 0000000000..6db10c2465 --- /dev/null +++ b/src/apps/xdp/pf_ebpf_codegen.lua @@ -0,0 +1,373 @@ +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. + +module(...,package.seeall) + +-- This module implements code generation for the XDP/eBPF backend of +-- Pflua. It takes the result of instruction selection (selection.lua) +-- and register allocation (regalloc.lua) and generates a function with +-- eBPF bytecode. + +local parse = require('pf.parse').parse +local expand = require('pf.expand').expand +local optimize = require('pf.optimize').optimize +local anf = require('pf.anf') +local ssa = require('pf.ssa') +local sel = require("pf.selection") +local ra = require("pf.regalloc") +local bpf = require("apps.xdp.bpf") + +local c, f, m, a, s, j = bpf.c, bpf.f, bpf.m, bpf.a, bpf.s, bpf.j + +local tobit, band, bor, rshift = bit.tobit, bit.band, bit.bor, bit.rshift + +-- eBPF register allocation: +-- * mark r1 callee save: holds the xdp_md context we wish to preserve +-- * omit r0: we will keep a pointer to the packet payload in here +-- * omit r2: we will use this register to perform length checks +-- * use r3 as len: we will store data_end here (used in length checks) +local ebpf_regs = { + caller_regs = { 9, 8, 7, 6, 5, 4, 3 }, + callee_regs = { 1 }, + len = 3 +} + +-- Generate a eBPF XDP program that will return XDP_PASS unless filter expr +-- matches, and otherwise "fall-though" as to allow execution of a further eBPF +-- program that is to be appended. +function codegen (ir, alloc) + -- push callee-save registers if we use any + local to_pop = {} + for reg, _ in pairs(alloc.callee_saves) do + error("NYI: callee saves") + -- we need to record the order in which to pop + -- b/c while the push order doesn't matter, the + -- pop order must be reverse (and callee_saves + -- is an unordered set) + table.insert(to_pop, reg) + end + + -- in bytes + local stack_slot_size = 8 + + -- allocate space for all spilled vars + local spilled_space = 0 + for _, _ in pairs(alloc.spills) do + spilled_space = spilled_space + stack_slot_size + end + if spilled_space > 0 then + error("NYI: spilled space") + end + + -- if the length variable got spilled, we need to explicitly initialize + -- the stack slot for it + if alloc.spills["len"] then + error("NYI: spilled length") + end + + local pc, tr = 1, {} + local function emit (ins) + tr[pc] = ins + pc = pc+1 + end + + local label_offset, labels = 2, {} + + local cmp + local function emit_cjmp (cond, target) + assert(cmp, "cjmp needs preceeding cmp") + local jmp = cmp; cmp = nil + jmp.op = bor(c.JMP, cond, jmp.op) + if target == "true-label" then + jmp.off = 0 + elseif target == "false-label" then + jmp.off = 1 + else + jmp.off = label_offset+target + end + emit(jmp) + end + + -- Setup: move data start and end pointers into r0 and r(alloc.len) + -- r0 = ((struct xdp_md *)ctx)->data + emit{ op=bor(c.LDX, f.W, m.MEM), dst=0, src=1, off=0 } + -- r(alloc.len) = ((struct xdp_md *)ctx)->data_end + emit{ op=bor(c.LDX, f.W, m.MEM), dst=alloc.len, src=1, off=4 } + + for idx, instr in ipairs(ir) do + local itype = instr[1] + + --- FIXME: handle spills + + -- the core code generation logic starts here + if itype == "label" then + local lnum = instr[2] + labels[label_offset+lnum] = pc + + elseif itype == "cjmp" then + local op, target = instr[2], instr[3] + + if op == "=" then + emit_cjmp(j.JEQ, target) + elseif op == "!=" then + emit_cjmp(j.JNE, target) + elseif op == ">=" then + emit_cjmp(j.JGE, target) + elseif op == "<=" then + emit_cjmp(j.JLE, target) + elseif op == ">" then + emit_cjmp(j.JGT, target) + elseif op == "<" then + emit_cjmp(j.JLT, target) + end + + elseif itype == "jmp" then + local next_instr = ir[idx+1] + -- if the jump target is immediately after this in the instruction + -- sequence then don't generate the jump + if (type(instr[2]) == "number" and + next_instr[1] == "label" and + next_instr[2] == instr[2]) then + -- don't output anything + else + if instr[2] == "true-label" then + if next_instr[1] ~= "ret-true" then + emit{ op=bor(c.JMP, j.JA), off=0 } + end + elseif instr[2] == "false-label" then + if next_instr[1] ~= "ret-false" then + emit{ op=bor(c.JMP, j.JA), off=1 } + end + else + emit{ op=bor(c.JMP, j.JA), off=label_offset+instr[2] } + end + end + + elseif itype == "cmp" and instr[2] == "len" then + local lhs_reg = alloc.len + local rhs = instr[3] + assert(rhs ~= "len", "NYI: cmp with rhs len") + + -- Perform eBPF friendly length check. + -- mov r2, r0 + emit{ op=bor(c.ALU64, a.MOV, s.X), dst=2, src=0 } + -- add r2, rhs + if type(rhs) == "number" then + emit{ op=bor(c.ALU64, a.ADD, s.K), dst=2, imm=rhs } + else + emit{ op=bor(c.ALU64, a.ADD, s.X), dst=2, src=alloc[rhs] } + end + -- cmp r6, r2 + cmp = { op=s.X, dst=lhs_reg, src=2 } + + elseif itype == "cmp" then + -- the lhs should never be an immediate so this should be non-nil + local lhs_reg = assert(alloc[instr[2]]) + local rhs = instr[3] + assert(rhs ~= "len", "NYI: cmp with rhs len") + + if type(rhs) == "number" then + cmp = { op=s.K, dst=lhs_reg, imm=rhs } + else + local rhs_reg = alloc[rhs] + cmp = { op=s.X, dst=lhs_reg, src=rhs_reg } + end + + elseif itype == "load" then + local target = alloc[instr[2]] + assert(not alloc.spills[instr[2]], "NYI: load spill") + local offset = instr[3] + local bytes = instr[4] + + if type(offset) == "number" then + if bytes == 1 then + emit{ op=bor(c.LDX, f.B, m.MEM), dst=target, off=offset } + elseif bytes == 2 then + emit{ op=bor(c.LDX, f.H, m.MEM), dst=target, off=offset } + else + emit{ op=bor(c.LDX, f.W, m.MEM), dst=target, off=offset } + end + else + local reg = alloc[offset] + assert(not alloc.spills[offset], "NYI: load spill") + + emit{ op=bor(c.ALU64, a.ADD, s.X), dst=reg } + if bytes == 1 then + emit{ op=bor(c.LDX, f.B, m.MEM), dst=target, src=reg } + elseif bytes == 2 then + emit{ op=bor(c.LDX, f.H, m.MEM), dst=target, src=reg } + else + emit{ op=bor(c.LDX, f.W, m.MEM), dst=target, src=reg } + end + emit{ op=bor(c.ALU64, a.SUB, s.X), dst=reg } + end + + elseif itype == "mov" then + local dst = alloc[instr[2]] + assert(not alloc.spills[instr[2]], "NYI: mov spill") + local arg = instr[3] + + if type(arg) == "number" then + emit{ op=bor(c.ALU, a.MOV, s.K), dst=dst, imm=arg } + else + assert(not alloc.spills[arg], "NYI: mov spill") + emit{ op=bor(c.ALU64, a.MOV, s.X), dst=dst, src=alloc[arg] } + end + + elseif itype == "mov64" then + local dst = alloc[instr[2]] + local imm = instr[3] + emit{ op=bor(c.LD, f.DW, m.IMM), dst=dst, src=s.K, imm=tobit(imm) } + emit{ imm=rshift(imm, 32) } + + elseif itype == "add" then + local reg1, reg2 = alloc[instr[2]], alloc[instr[3]] + emit{ op=bor(c.ALU64, a.ADD, s.X), dst=reg1, src=reg2 } + + elseif itype == "sub" then + local reg1, reg2 = alloc[instr[2]], alloc[instr[3]] + emit{ op=bor(c.ALU64, a.SUB, s.X), dst=reg1, src=reg2 } + + elseif itype == "mul" then + local reg1, reg2 = alloc[instr[2]], alloc[instr[3]] + emit{ op=bor(c.ALU64, a.MUL, s.X), dst=reg1, src=reg2 } + + -- For division we use floating point division to avoid having + -- to deal with the %eax register for the div instruction. + elseif itype == "div" then + local reg1, reg2 = alloc[instr[2]], alloc[instr[3]] + emit{ op=bor(c.ALU64, a.DIV, s.X), dst=reg1, src=reg2 } + + elseif itype == "and" then + local reg1, reg2 = alloc[instr[2]], alloc[instr[3]] + emit{ op=bor(c.ALU64, a.AND, s.X), dst=reg1, src=reg2 } + + elseif itype == "or" then + local reg1, reg2 = alloc[instr[2]], alloc[instr[3]] + emit{ op=bor(c.ALU64, a.OR, s.X), dst=reg1, src=reg2 } + + elseif itype == "xor" then + local reg1, reg2 = alloc[instr[2]], alloc[instr[3]] + emit{ op=bor(c.ALU64, a.XOR, s.X), dst=reg1, src=reg2 } + + elseif itype == "shl" then + local reg1, reg2 = alloc[instr[2]], alloc[instr[3]] + emit{ op=bor(c.ALU64, a.LSH, s.X), dst=reg1, src=reg2 } + + elseif itype == "shr" then + local reg1, reg2 = alloc[instr[2]], alloc[instr[3]] + emit{ op=bor(c.ALU64, a.RSH, s.X), dst=reg1, src=reg2 } + + elseif itype == "add-i" then + local reg = alloc[instr[2]] + emit{ op=bor(c.ALU64, a.ADD, s.K), dst=reg, imm=instr[3] } + + elseif itype == "sub-i" then + local reg = alloc[instr[2]] + emit{ op=bor(c.ALU64, a.SUB, s.K), dst=reg, imm=instr[3] } + + elseif itype == "mul-i" then + local r = alloc[instr[2]] + emit{ op=bor(c.ALU64, a.MUL, s.K), dst=reg, imm=instr[3] } + + elseif itype == "and-i" then + local reg = alloc[instr[2]] + assert(type(reg) == "number") + assert(type(instr[3]) == "number") + emit{ op=bor(c.ALU64, a.AND, s.K), dst=reg, imm=instr[3] } + + elseif itype == "or-i" then + local reg = alloc[instr[2]] + assert(type(reg) == "number") + assert(type(instr[3]) == "number") + emit{ op=bor(c.ALU64, a.OR, s.K), dst=reg, imm=instr[3] } + + elseif itype == "xor-i" then + local reg = alloc[instr[2]] + assert(type(reg) == "number") + assert(type(instr[3]) == "number") + emit{ op=bor(c.ALU64, a.XOR, s.K), dst=reg, imm=instr[3] } + + elseif itype == "shl-i" then + local reg = alloc[instr[2]] + emit{ op=bor(c.ALU64, a.LSH, s.K), dst=reg, imm=instr[3] } + + elseif itype == "shr-i" then + local reg = alloc[instr[2]] + emit{ op=bor(c.ALU64, a.RSH, s.K), dst=reg, imm=instr[3] } + + elseif itype == "ntohs" then + local reg = alloc[instr[2]] + emit{ op=bor(c.ALU, a.END, a.BE), dst=reg, imm=16 } + + elseif itype == "ntohl" then + local reg = alloc[instr[2]] + emit{ op=bor(c.ALU, a.END, a.BE), dst=reg, imm=32 } + + elseif itype == "uint32" then + local reg = alloc[instr[2]] + emit{ op=bor(c.ALU, a.AND, s.X), dst=reg, src=reg } + + elseif itype == "ret-true" then + labels[0] = pc + -- In the end, we will turn this into a jump to the first instruction + -- beyond the end of the emitted sequence. + emit{ op=bor(c.JMP, j.JA) } + + elseif itype == "ret-false" then + labels[1] = pc + -- r0 = XDP_PASS + emit{ op=bor(c.ALU, a.MOV, s.K), dst=0, imm=2 } + -- EXIT: + emit{ op=bor(c.JMP, j.EXIT) } + + elseif itype == "nop" then + -- don't output anything + + else + error(string.format("NYI instruction %s", itype)) + end + end + + -- Fixup true-label + local true_label = labels[0] + if true_label == #tr then + -- True-label is last instruction: remove its target instruction + tr[true_label] = nil + elseif true_label then + -- Set the jump offset to the first ins. beyond the emitted sequence + tr[true_label].off = #tr - true_label + end + + -- Fixup jump offsets + for pc, ins in ipairs(tr) do + if band(ins.op, c.JMP) == c.JMP and ins.off then + ins.off = labels[ins.off] - (pc+1) + end + end + + return tr +end + +function compile(filter, dump) + local expr = optimize(expand(parse(filter), "EN10MB")) + local ssa = ssa.convert_ssa(anf.convert_anf(expr)) + local ir = sel.select(ssa) + local alloc = ra.allocate(ir, ebpf_regs) + local code = codegen(ir, alloc) + if dump then + require("core.lib").print_object(alloc) + require("core.lib").print_object(ir) + print(filter) + bpf.dis(bpf.asm(code)) + end + return code +end + +function selftest() + compile("ip proto esp or ip proto 99 or arp", "dump") + compile("ip6[6] = 50 or ip6[6] = 99 or ".. + "(ip6[6] = 58 and (ip6[40] = 135 or ip6[40] = 136))", + "dump") + compile("1 = 2", + "dump") +end diff --git a/src/apps/xdp/test_filter.lua b/src/apps/xdp/test_filter.lua new file mode 100644 index 0000000000..2a89be1c66 --- /dev/null +++ b/src/apps/xdp/test_filter.lua @@ -0,0 +1,14 @@ +module(...,package.seeall) + +local xdp = require("apps.xdp.xdp") + +function selftest () + print("selftest: apps.xdp.test_filter") + local xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues = xdp.selftest_init() + if nqueues > 1 then + os.exit(engine.test_skipped_code) + end + print("test: rxtx_match_filter") + xdp.selftest_rxtx_match_filter(xdpdeva, xdpmaca, xdpdevb, xdpmacb) + print("selftest ok") +end \ No newline at end of file diff --git a/src/apps/xdp/test_filter_pass.lua b/src/apps/xdp/test_filter_pass.lua new file mode 100644 index 0000000000..f2b3301497 --- /dev/null +++ b/src/apps/xdp/test_filter_pass.lua @@ -0,0 +1,14 @@ +module(...,package.seeall) + +local xdp = require("apps.xdp.xdp") + +function selftest () + print("selftest: apps.xdp.test_filter_pass") + local xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues = xdp.selftest_init() + if nqueues > 1 then + os.exit(engine.test_skipped_code) + end + print("test: rxtx_match_filter_pass") + xdp.selftest_rxtx_match_filter_pass(xdpdeva, xdpmaca, xdpdevb, xdpmacb) + print("selftest ok") +end \ No newline at end of file diff --git a/src/apps/xdp/test_rxtx.lua b/src/apps/xdp/test_rxtx.lua new file mode 100644 index 0000000000..71bbb3739f --- /dev/null +++ b/src/apps/xdp/test_rxtx.lua @@ -0,0 +1,13 @@ +module(...,package.seeall) + +local xdp = require("apps.xdp.xdp") + +function selftest () + print("selftest: apps.xdp.test_rxtx") + local xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues = xdp.selftest_init() + print("test: rxtx") + xdp.selftest_rxtx(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) + print("test: duplex") + xdp.selftest_duplex(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) + print("selftest ok") +end \ No newline at end of file diff --git a/src/apps/xdp/test_share.lua b/src/apps/xdp/test_share.lua new file mode 100644 index 0000000000..24d00f94ba --- /dev/null +++ b/src/apps/xdp/test_share.lua @@ -0,0 +1,14 @@ +module(...,package.seeall) + +local xdp = require("apps.xdp.xdp") + +function selftest () + print("selftest: apps.xdp.test_share") + local xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues = xdp.selftest_init() + if nqueues <= 1 then + os.exit(engine.test_skipped_code) + end + print("test: share_interface") + xdp.selftest_share_interface(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) + print("selftest ok") +end \ No newline at end of file diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua new file mode 100644 index 0000000000..edb0ad0810 --- /dev/null +++ b/src/apps/xdp/xdp.lua @@ -0,0 +1,995 @@ +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. + +module(...,package.seeall) + +local S = require("syscall") +local ffi = require("ffi") +local bpf = require("apps.xdp.bpf") +local pf = require("apps.xdp.pf_ebpf_codegen") +local lib = require("core.lib") +local bits = lib.bits +local band, bor, rshift, tobit = bit.band, bit.bor, bit.rshift, bit.tobit + +-- ---- XDP driver for Snabb -------------------------------------------- + +-- This is a Snabb driver for Linux AF_XDP[1][2] sockets. The XDP kernel +-- interface presents an ABI/API combination similar to what a hardware NIC +-- usually provides: a way to attach to hardware queues, and a set of +-- descriptor rings for each queue used to enqueue and dequeue packet memory +-- buffers. +-- +-- Like with hardware NICs, XDP imposes us with constraints on the kind of +-- memory buffers we can enqueue onto its descriptor rings. Instead of DMA +-- memory required to drive hardware NICs, XDP requires us to register a +-- special kind of memory called UMEM to use with an AF_XDP socket. Only +-- buffers in the UMEM registered with a given socket can be used for I/O with +-- that socket! +-- +-- To consolidate this and other constraints (see "UMEM allocation" below) with +-- Snabb's packet memory architecture this driver allocates a single contiguous +-- memory region used as UMEM for all of the process' AF_XDP sockets, and +-- replaces the memory allocation routine dma_alloc in core.memory with its own +-- UMEM allocator. Hence, the packet freelist will be filled with UMEM memory +-- buffers used for all packet allocations. +-- +-- snabb_enable_xdp() +-- +-- To use the XDP app, "Snabb XDP mode" must be enabled by calling this +-- function. Calling this function replaces Snabb's native memory +-- allocator with the UMEM allocator. +-- +-- The caller must ensure that no packets have been allocated via +-- packet.allocate() prior to calling this function. +-- +-- CAVEATS: +-- +-- * Memory allocated by the UMEM allocator can not be used with DMA +-- drivers: using the XDP app precludes the use of Snabb's native +-- hardware drivers. +-- +-- * Memory allocated by the UMEM allocator can not be shared with +-- other Snabb processes in the same process group: using +-- snabb_enable_xdp precludes the use of Interlink apps +-- (apps.interlink). +-- +-- * UMEM chunks can not be larger than the page size (4096 bytes). +-- This AD_XDP limitation plus the way Snabb implements packet +-- buffer shifting operations limits the effective MTU: the MTU of +-- the XDP app is limited to 3,582 bytes. See XDP:create_xsk(). +-- +-- The only means by which an AF_XDP socket can receive packets from a device +-- is by attaching an eBPF XDP program to the Linux interface. The XDP app +-- assembles a minimal BPF program to route packets from device queues to XDP +-- sockets. See XDP:initialize_xdp. +-- +-- References: +-- [1] https://www.kernel.org/doc/html/v5.3/networking/af_xdp.html +-- [2] The Linux kernel source repository + + +-- ---- UMEM allocation ------------------------------------------------- + +-- Must maintain invariants: chunk size must be <= page size and UMEM must be +-- aligned to page size. + +local page_size = S.getpagesize() +local chunk_size = page_size +local num_chunks = 200000 +local umem_backing, umem, umem_size, umem_used + +-- UMEM allocator: multiple UMEM chunks must be allocated to fit a full packet. +-- However, AF_XDP sockets will only ever see the first of the chunks that make +-- up a packet. The extra (two) UMEM chunks are effectively unused by the +-- socket (but used by Snabb to ensure that packets can actually use +-- packet.max_payload bytes of payload). +-- See core.packet, "XDP rings", XDP:create_xsk(). +local function umem_alloc (size, align) + -- NB: align parameter ignored as we align to chunk_size + assert(align <= chunk_size) + assert(umem_used + size <= umem_size, + "Out of packet buffer memory. Increase num_chunks?") + local chunk = umem + umem_used + umem_used = lib.align(umem_used + size, chunk_size) + return chunk +end + +-- Convert from pointer to relative UMEM offset. +local function to_umem (ptr) + return ffi.cast("uintptr_t", ptr) - ffi.cast("uintptr_t", umem) +end + +-- Convert relative UMEM offset to pointer. +local function from_umem (offset) + return umem + offset +end + +local snabb_xdp_enabled = false +function snabb_enable_xdp (opt) + opt = opt or {} + if opt.num_chunks then + num_chunks = math.ceil(assert(tonumber(opt.num_chunks), + "num_chunks must be a number")) + end + -- Allocate UMEM + umem_size = chunk_size * num_chunks + umem_backing = ffi.new("char[?]", umem_size + page_size) + umem = ffi.cast("char*", lib.align(ffi.cast("uintptr_t", umem_backing), page_size)) + umem_used = 0 + -- Hot-swap core.memory.dma_alloc + require("core.memory").dma_alloc = umem_alloc + snabb_xdp_enabled = true +end + + +-- ---- FFI types ------------------------------------------------------- + +local xdp_umem_reg_t = ffi.typeof[[ + struct { + void * addr; /* Start of packet data area */ + uint64_t len; /* Length of packet data area */ + uint32_t chunk_size; + uint32_t headroom; + uint32_t flags; /* Not available in 4.19 */ + } __attribute__((packed))]] + +local sockaddr_xdp_t = ffi.typeof[[ + struct { + uint16_t family; + uint16_t flags; + uint32_t ifindex; + uint32_t queue_id; + uint32_t shared_umem_fd; + } __attribute__((packed))]] + +local xdp_ring_offset_t = ffi.typeof[[ + struct { + uint64_t producer; + uint64_t consumer; + uint64_t desc; + uint64_t flags; /* Not available in 4.19 */ + } __attribute__((packed))]] + +local xdp_ring_offset_noflags_t = ffi.typeof[[ + struct { + uint64_t producer; + uint64_t consumer; + uint64_t desc; + } __attribute__((packed))]] + +local xdp_mmap_offsets_templ = [[ + struct { + $ rx, + tx, + fr, /* Fill */ + cr; /* Completion */ + } __attribute__((packed))]] +local xdp_mmap_offsets_noflags_t = + ffi.typeof(xdp_mmap_offsets_templ, xdp_ring_offset_noflags_t) +local xdp_mmap_offsets_t = + ffi.typeof(xdp_mmap_offsets_templ, xdp_ring_offset_t) + +local xdp_ring_t = ffi.typeof[[ + struct { + char *map; + size_t maplen; + uint32_t *producer, *consumer, *flags; + void *desc; + uint32_t write, read; + }]] + +local xdp_desc_t = ffi.typeof[[ + struct { + uint64_t addr; + uint32_t len; + uint32_t options; + } __attribute__((packed))]] +local xdp_desc_ptr_t = ffi.typeof("$ *", xdp_desc_t) + +local netlink_set_link_xdp_request_t = ffi.typeof[[ + struct { + struct { /* nlmsghdr */ + uint32_t nlmsg_len; /* Length of message including header */ + uint16_t nlmsg_type; /* Message content */ + uint16_t nlmsg_flags; /* Additional flags */ + uint32_t nlmsg_seq; /* Sequence number */ + uint32_t nlmsg_pid; /* Sending process port ID */ + } nh; + struct { /* ifinfomsg */ + unsigned char ifi_family; + unsigned char __ifi_pad; + unsigned short ifi_type; /* ARPHRD_* */ + int ifi_index; /* Link index */ + unsigned ifi_flags; /* IFF_* flags */ + unsigned ifi_change; /* IFF_* change mask */ + } ifinfo; + struct { /* nlattr */ + uint16_t nla_len; + uint16_t nla_type; + } xdp; + struct { /* nlattr */ + uint16_t nla_len; + uint16_t nla_type; + int32_t fd; + } xdp_fd; + }__attribute__((packed))]] + + +-- ---- XDP rings ------------------------------------------------------- + +-- Ring operations for the single-producer single-consumer rings used for I/O +-- with AF_XDP sockets (xdp_ring_t). This is is a blend between an +-- "Array + two unmasked indices"[1] and MCRingBuffer[2] implementation. +-- +-- Only the "Array + two unmasked indices" half of the implementation is +-- actually exposed by the kernel via the pointers to shared consumer/producer +-- fields (see xdp_ring_t, XDP:xdp_map_ring()). The MCRingBuffer portion is +-- added by userspace (us) to optimize our CPU cache footprint. +-- +-- Each AF_XDP socket has two rings (rx, tx) and each UMEM has two rings +-- (fr - fill ring, cr - completion ring). This XDP driver registers a new UMEM +-- for each socket so that each socket effectively has four rings +-- (rx, tx, fr, cr). +-- +-- For the Linux kernel to be able to fill the rx ring we need to provide it +-- UMEM chunks via the fill ring (fr). Chunks used by us to send packets via +-- the tx ring are returned by the kernel back to the userspace application via +-- the completion ring (cr). +-- +-- It is important to note that XDP rings operate on chunks: the addr field +-- of xdp_desc_t points *into* a chunk, and its len field is, from the kernel’s +-- perspective, bounded to the end of that chunk. See "UMEM allocation" and +-- XDP:create_xsk() for how this affects Snabb. +-- +-- NB: Snabb packet payloads are preceded by a two byte length field, so we +-- have to account for this overhead when retrieving packets from XDP +-- descriptor rings. See receive(r) below and XDP:create_xsk(). +-- +-- References: +-- [1] https://www.snellman.net/blog/archive/2016-12-13-ring-buffers/ +-- [2] https://www.cse.cuhk.edu.hk/~pclee/www/pubs/ancs09poster.pdf + +local xdp_ring_ndesc = 2048 -- Number of descriptors in ring. + +local function mask (i) return band(i, xdp_ring_ndesc - 1) end +local function inc (i) return tobit(i + 1) end +local function full1 (r, w) return tobit(w - r) == xdp_ring_ndesc end + +function full (r) + if full1(r.read, r.write) then + if full1(r.consumer[0], r.write) then + return true + end + r.read = r.consumer[0] + end +end + +function transmit (r, p) + local desc = ffi.cast(xdp_desc_ptr_t, r.desc) + local idx = mask(r.write) + desc[idx].addr = to_umem(p.data) + desc[idx].len = p.length + r.write = inc(r.write) +end + +function fill (r, p) + local desc = ffi.cast("uint64_t *", r.desc) + local idx = mask(r.write) + desc[idx] = to_umem(p) + r.write = inc(r.write) +end + +function push (r) + -- NB: no need for memory barrier on x86 because of TSO. + r.producer[0] = r.write +end + +function empty (r) + if r.read == r.write then + if r.read == r.producer[0] then + return true + end + r.write = r.producer[0] + end +end + +local packet_overhead = 2 -- leading struct packet length field (uint16_t) +function receive (r) + local desc = ffi.cast(xdp_desc_ptr_t, r.desc) + local idx = mask(r.read) + local p = ffi.cast("struct packet *", + -- packet struct begins at payload - packet_overhead + from_umem(desc[idx].addr) - packet_overhead) + p.length = desc[idx].len + r.read = inc(r.read) + return p +end + +function reclaim (r) + -- NB: reclaim does not (re)set the payload length field. + -- Reclaimed packets do *not* have known payload lengths! + local desc = ffi.cast("uint64_t *", r.desc) + local idx = mask(r.read) + local p = ffi.cast("struct packet *", from_umem(desc[idx])) + r.read = inc(r.read) + return p +end + +function pull (r) + -- NB: no need for memory barrier on x86 (see push.) + r.consumer[0] = r.read +end + +function needs_wakeup (r) + -- NB: Unavailable when kernel does not support ring flags. + -- See: XDP.kernel_has_ring_flags, XDP:create_xsk(), XDP:kick() + return band(r.flags[0], bits{XDP_RING_NEED_WAKEUP=1}) +end + +-- Rewind routines for transmit/fill. These are used by XDP:stop() to reclaim +-- packet buffers left in-fight after shutdown. + +function rewind_transmit (r) + r.write = tobit(r.write - 1) + local desc = ffi.cast(xdp_desc_ptr_t, r.desc) + local idx = mask(r.write) + return ffi.cast("struct packet *", + -- packet struct begins at payload - packet_overhead + from_umem(desc[idx].addr) - packet_overhead) +end + +function rewind_fill (r) + r.write = tobit(r.write - 1) + local desc = ffi.cast("uint64_t *", r.desc) + local idx = mask(r.write) + return ffi.cast("struct packet *", from_umem(desc[idx])) +end + + +-- ---- XDP App --------------------------------------------------------- + +XDP = { + config = { + ifname = {required=true}, -- interface name + filter = {}, -- interface pcap-filter(7) (optional) + queue = {default=0} -- interface queue (zero based) + }, + -- Class variables: + kernel_has_ring_flags = true -- feature detection status for descriptor ring flags +} + +-- The `driver' variable is used as a reference to the driver class in +-- order to interchangeably use NIC drivers. +driver = XDP + +-- Class methods + +function XDP:new (conf) + assert(snabb_xdp_enabled, "Snabb XDP mode must be enabled.") + -- Ensure interface is initialized for XDP usage. + local lockfd, mapfd = self:open_interface(conf.ifname, conf.filter) + -- Create XDP socket (xsk) for queue. + local xsk = self:create_xsk(conf.ifname, lockfd, conf.queue) + -- Attach the socket to queue in the BPF map. + self:set_queue_socket(mapfd, conf.queue, xsk) + mapfd:close() -- not longer needed + -- Finish initialization. + return setmetatable(xsk, {__index=XDP}) +end + +function XDP:open_interface (ifname, filter) + -- Open an interface-dependent file we know should exist to use as a + -- Snabb-wide lock. The contents of the file are really irrelevant here. + -- However, we depend on the file not being locked by other applications in + -- general. :-) + local lockfd = S.open("/sys/class/net/"..ifname.."/operstate", "rdonly") + local mapfd, progfd + local xskmap_path = "/sys/fs/bpf/snabb/"..ifname.."/xskmap" + local prog_path = "/sys/fs/bpf/snabb/"..ifname.."/xdp" + -- If the open above failed we assume that no device by ifname exists. + assert(lockfd, "Could not open interface: "..ifname.." (does it exist?)") + if lockfd:flock("ex, nb") then + -- If we get an exclusive lock we know that no other Snabb processes are + -- using the interface so its safe to setup the interface and replace any + -- existsing BPF XDP program/maps attached to it. + S.mkdir("/sys/fs/bpf/snabb", "rwxu, rgrp, xgrp, roth, xoth") + S.util.rm("/sys/fs/bpf/snabb/"..ifname) + S.mkdir("/sys/fs/bpf/snabb/"..ifname, "rwxu, rgrp, xgrp, roth, xoth") + -- Create xskmap and XDP program to run on the NIC. + mapfd = self:create_xskmap() + progfd = self:xdp_prog(mapfd, filter) + self:set_link_xdp(ifname, progfd) + -- Pin xskmap so it can be accessed by other Snabb processes to attach to + -- the interface. Also pin the XDP program, just 'cause. + assert(S.bpf_obj_pin(xskmap_path, mapfd)) + assert(S.bpf_obj_pin(prog_path, progfd)) + progfd:close() -- no longer needed + lockfd:flock("sh") -- share lock + else + lockfd:flock("sh") + -- Wait for the lock to be shared: once it is no longer held exclusively + -- we know that the interface is setup and ready to use. + -- Get the currently pinned xskmap to insert our XDP socket into. + mapfd = assert(S.bpf_obj_get(xskmap_path)) + end + -- lockfd: holds a shared lock for as long as we do not close it, signaling + -- other Snabb processes that the interface is in use. + -- mapfd: the xskmap for the interface used to + -- attach XDP sockets to queues. + return lockfd, mapfd +end + +function XDP:create_xskmap () + local klen, vlen = ffi.sizeof("int"), ffi.sizeof("int") + local nentries = 128 + local map, err + for _ = 1,7 do + -- Try to create BPF map. + map, err = S.bpf_map_create('xskmap', klen, vlen, nentries) + -- Return map on success. + if map then return map end + -- Failed to create map, increase MEMLOCK limit and retry. + -- See https://github.com/xdp-project/xdp-tutorial/issues/63 + local lim = assert(S.getrlimit('memlock')) + assert(S.setrlimit('memlock', {cur=lim.cur*2, max=lim.max*2})) + end + -- Exceeded retries, bail. + error("Failed to create BPF map: "..tostring(err)) +end + +function XDP:xdp_prog (xskmap, filter) + -- Assemble and load XDP BPF program. + -- If we have a filter argument, compile a filter that passes non-matching + -- packets on to the kernel networking stack (XDP_PASS). Append to it our + -- regular XSK forwarding code (XDP:xdp_forward) so packets that pass + -- the filter are forwarded to attached XDP sockets. + local flt = (filter and pf.compile(filter)) or {} + for _, ins in ipairs(self:xdp_forward(xskmap)) do + -- Append forwarding logic to filter. + table.insert(flt, ins) + end + local asm = bpf.asm(flt) + local prog, err, log = S.bpf_prog_load( + 'xdp', asm, ffi.sizeof(asm) / ffi.sizeof(bpf.ins), "Apache 2.0" + ) + if prog then + return prog + else + error(tostring(err).."\n"..log) + end +end + +function XDP:xdp_forward (xskmap) + local c, f, m, a, s, j, fn = + bpf.c, bpf.f, bpf.m, bpf.a, bpf.s, bpf.j, bpf.fn + -- The program below looks up the incoming packet's queue index in xskmap to + -- find the corresponding XDP socket (xsk) to deliver the packet to. + return { + -- r3 = XDP_ABORTED + { op=bor(c.ALU, a.MOV, s.K), dst=3, imm=0 }, + -- r2 = ((struct xdp_md *)ctx)->rx_queue_index + { op=bor(c.LDX, f.W, m.MEM), dst=2, src=1, off=16 }, + -- r1 = xskmap + { op=bor(c.LD, f.DW, m.IMM), dst=1, src=s.MAP_FD, imm=xskmap:getfd() }, + { imm=0 }, -- nb: upper 32 bits of 64-bit (DW) immediate + -- r0 = redirect_map(r1, r2, r3) + { op=bor(c.JMP, j.CALL), imm=fn.redirect_map }, + -- EXIT: + { op=bor(c.JMP, j.EXIT) } + } +end + +function XDP:set_link_xdp (ifname, prog) + -- Open a NETLINK socket, and transmit command that attaches XDP program + -- prog to link by ifname. + local netlink = assert(S.socket('netlink', 'raw', 'route')) + local SOL_NETLINK = 270 + local NETLINK_EXT_ACK = 11 + local ext_ack_on = ffi.new("int[1]", 1) + assert(S.setsockopt(netlink, SOL_NETLINK, NETLINK_EXT_ACK, + ext_ack_on, ffi.sizeof(ext_ack_on))) + local IFLA_XDP = 43 + local IFLA_XDP_FD = 1 + local IFLA_XDP_FLAGS = 3 + local request = ffi.new( + netlink_set_link_xdp_request_t, + { nh = { nlmsg_flags = bor(S.c.NLM_F.REQUEST, S.c.NLM_F.ACK), + nlmsg_type = S.c.RTM.SETLINK }, + ifinfo = { ifi_family = S.c.AF.UNSPEC, + ifi_index = S.util.if_nametoindex(ifname) }, + xdp = { nla_type = bor(bits{ NLA_F_NESTED=15 }, IFLA_XDP) }, + xdp_fd = { nla_type = IFLA_XDP_FD, + fd = prog:getfd() } } + ) + request.nh.nlmsg_len = ffi.sizeof(request) + request.xdp.nla_len = ffi.sizeof(request.xdp) + ffi.sizeof(request.xdp_fd) + request.xdp_fd.nla_len = ffi.sizeof(request.xdp_fd) + assert(netlink:send(request, ffi.sizeof(request))) + local response = assert(S.nl.read(netlink, nil, nil, true)) + if response.error then + error("NETLINK responded with error: "..tostring(response.error)) + end + netlink:close() +end + +function XDP:create_xsk (ifname, lockfd, queue) + local xsk = { sock = assert(S.socket('xdp', 'raw')), lockfd = lockfd } + -- Register UMEM. + local umem_reg = ffi.new( + xdp_umem_reg_t, + { addr = umem, + len = umem_size, + -- The chunk size is equal to the page size (4096 bytes, see + -- "UMEM allocation"), and XDP packet descriptors point to individual + -- chunks (see "XDP rings"). Hence, the MTU of AF_XDP sockets is + -- limited to the page size, and the effective MTU of the XDP app is + -- further limited by the way core.packet implements packet shifting + -- operations (see headroom below). The effective MTU is calculated as + -- 4096 - packet.packet_alignment (512) - packet_overhead (2) = 3582 + chunk_size = chunk_size, + -- By configuring the headroom according to core.packet we make sure + -- that XDP leaves enough headroom for the preceeding length field of + -- Snabb's struct packet as well as headroom for packet shifting + -- operations. + headroom = packet.default_headroom + packet_overhead, + -- flags = bits{ XDP_UMEM_UNALIGNED_CHUNK_FLAG=1 } + } + ) + assert(xsk.sock:setsockopt('xdp', 'xdp_umem_reg', umem_reg, ffi.sizeof(umem_reg))) + -- Configure XDP rings and map them into this process’ memory. + local ndesc = ffi.new("int[1]", xdp_ring_ndesc) + assert(xsk.sock:setsockopt('xdp', 'xdp_rx_ring', ndesc, ffi.sizeof(ndesc))) + assert(xsk.sock:setsockopt('xdp', 'xdp_tx_ring', ndesc, ffi.sizeof(ndesc))) + assert(xsk.sock:setsockopt('xdp', 'xdp_umem_fill_ring', ndesc, ffi.sizeof(ndesc))) + assert(xsk.sock:setsockopt('xdp', 'xdp_umem_completion_ring', ndesc, ffi.sizeof(ndesc))) + local layouts = ffi.new(xdp_mmap_offsets_t) + if not pcall(S.getsockopt, xsk.sock, 'xdp', 'xdp_mmap_offsets', layouts, ffi.sizeof(layouts)) then + -- Kernel appears not to support XDP ring flags field. Disable feature, + -- and retry with xdp_mmap_offsets_noflags_t. + self.kernel_has_ring_flags = false + layouts = ffi.new(xdp_mmap_offsets_noflags_t) + assert(xsk.sock:getsockopt('xdp', 'xdp_mmap_offsets', layouts, ffi.sizeof(layouts))) + end + xsk.rx = self:xdp_map_ring(xsk.sock, layouts.rx, xdp_desc_t, 0x000000000ULL) -- XDP_PGOFF_RX_RING + xsk.tx = self:xdp_map_ring(xsk.sock, layouts.tx, xdp_desc_t, 0x080000000ULL) -- XDP_PGOFF_TX_RING + -- NB: fill and completion rings do not carry full descriptors, only + -- relative UMEM offsets (addr). + xsk.fr = self:xdp_map_ring(xsk.sock, layouts.fr, "uint64_t", 0x100000000ULL) -- XDP_UMEM_PGOFF_FILL_RING + xsk.cr = self:xdp_map_ring(xsk.sock, layouts.cr, "uint64_t", 0x180000000ULL) -- XDP_UMEM_PGOFF_COMPLETION_RING + -- Counters to track packets in-flight through kernel. + -- - rxq is incremented when a packet buffer is enqueued onto the + -- fill ring and decremented when a packet buffer is dequeued from the + -- tx ring. I.e., it tracks the number of unused buffers currently left + -- on the fill ring. + -- - txq is incremented when a packet buffer is enqueued onto the tx ring + -- and decremented then a packet buffer is dequeued from the + -- completion ring. I.e, it tracks number of unused buffers currently + -- left on the tx ring. + -- The rxq and txq tallies are used by XDP:stop() to perform a clean + -- socket shutdown without leaking packet buffers. + xsk.rxq = 0 + xsk.txq = 0 + -- Bind socket to interface + local sa = ffi.new( + sockaddr_xdp_t, + { family = S.c.AF.XDP, + ifindex = S.util.if_nametoindex(ifname), + queue_id = queue, + -- flags = bits{ XDP_ZEROCOPY=2 } + } + ) + local ok, err = xsk.sock:bind(sa, ffi.sizeof(sa)) + if not ok then + error(("Unable to bind AF_XDP socket to %s queue %d (%s)") + :format(ifname, queue, err)) + end + return xsk +end + +-- Map an XDP socket ring into this process’ memory. +function XDP:xdp_map_ring (socket, layout, desc_t, offset) + local prot = "read, write" + local flags = "shared, populate" + local r = ffi.new(xdp_ring_t) + r.maplen = layout.desc + xdp_ring_ndesc * ffi.sizeof(desc_t) + r.map = assert(S.mmap(nil, r.maplen, prot, flags, socket, offset)) + r.producer = ffi.cast("uint32_t *", r.map + layout.producer) + r.consumer = ffi.cast("uint32_t *", r.map + layout.consumer) + if self.kernel_has_ring_flags then + r.flags = ffi.cast("uint32_t *", r.map + layout.flags) + end + r.desc = r.map + layout.desc + return r +end + +function XDP:set_queue_socket(xskmap, queue, xsk) + assert(S.bpf_map_op('map_update_elem', xskmap, + ffi.new("int[1]", queue), + ffi.new("int[1]", xsk.sock:getfd()))) +end + +-- Instance methods + +function XDP:stop () + -- XXX - previous shutdown sequence was broken (see git history for details.) + error("Can not stop XDP driver (operation not supported)") +end + +function XDP:pull () + local output = self.output.output + local rx = self.rx + self:refill() + if not output then return end + for _ = 1, engine.pull_npackets do + if empty(rx) then break end + link.transmit(output, receive(rx)) + self.rxq = self.rxq - 1 + end + pull(rx) +end + +function XDP:push () + local input = self.input.input + local tx = self.tx + if not input then return end + while not link.empty(input) and not full(tx) do + local p = link.receive(input) + transmit(tx, p) + self.txq = self.txq + 1 + -- Stimulate breathing: after the kernel is done with the packet buffer + -- it will either be fed back from the completion ring onto the free + -- ring, or put back onto the freelist via packet.free_internal; hence, + -- account statistics for freed packet here in order to signal to the + -- engine that throughput is happening. + packet.account_free(p) + end + push(tx) + if self.kernel_has_ring_flags then + if needs_wakeup(tx) then self:kick() end + else + if not empty(tx) then self:kick() end + end +end + +function XDP:refill () + local input, output = self.input.input, self.output.output + local fr, cr = self.fr, self.cr + -- If the queue operates in duplex mode (i.e., has both input and output + -- links attached) we feed packet buffers from the completion ring back onto + -- the fill ring. + if input and output then + while not (empty(cr) or full(fr)) do + fill(fr, reclaim(cr)) + self.txq = self.txq - 1 + self.rxq = self.rxq + 1 + end + end + -- If the queue has its output attached we make sure that the kernel does + -- not run out of packet buffers to fill the rx ring with by keeping the + -- fill ring topped up with fresh packets. + -- (If no input is attached, the completion ring is not used, and + -- all packet buffers for rx will be allocated here.) + if output then + while not full(fr) do + fill(fr, packet.allocate()) + self.rxq = self.rxq + 1 + end + end + -- If the queue has its input attached we release any packet buffers + -- remaining in the completion ring back to the packet freelist. + -- (If not output is attached, the fill ring is not used, and + -- all packet buffers used for tx will be reclaimed here.) + if input then + while not empty(cr) do + -- NB: mandatory free_internal since we do not know the payload length + -- of reclaimed packets. + packet.free_internal(reclaim(cr)) + self.txq = self.txq - 1 + end + end + push(fr) + pull(cr) +end + +function XDP:kick () + -- Wake up Linux kernel to process tx ring packets. + self.sock:sendto(nil, 0, 'dontwait', nil, 0) +end + + +-- ---- Tests ----------------------------------------------------------- + +-- Useful setup commands: +-- $ echo 0000:01:00.0 > /sys/bus/pci/drivers/ixgbe/bind +-- $ ip link set ens1f0 addr 02:00:00:00:00:00 +-- $ ethtool --set-channels ens1f0 combined 1 + +function selftest_init () + local xdpdeva = lib.getenv("SNABB_XDP0") + local xdpmaca = lib.getenv("SNABB_XDP_MAC0") + local xdpdevb = lib.getenv("SNABB_XDP1") + local xdpmacb = lib.getenv("SNABB_XDP_MAC1") + local nqueues = tonumber(lib.getenv("SNABB_XDP_NQUEUES")) or 1 + if not (xdpdeva and xdpmaca and xdpdevb and xdpmacb) then + print("SNABB_XDP0 and SNABB_XDP1 must be set. Skipping selftest.") + os.exit(engine.test_skipped_code) + end + snabb_enable_xdp() + engine.report_load() + return xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues +end + + +function selftest () + print("selftest: apps.xdp.xdp") + local xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues = selftest_init() + if nqueues > 1 then + os.exit(engine.test_skipped_code) + end + print("test: rxtx_match") + selftest_rxtx_match(xdpdeva, xdpmaca, xdpdevb, xdpmacb) + -- NB: see also test_*.lua + print("selftest ok") +end + +local function random_v4_packets (conf) + local ethernet = require("lib.protocol.ethernet") + local ipv4 = require("lib.protocol.ipv4") + local eth = ethernet:new{src = ethernet:pton(conf.src), + dst = ethernet:pton(conf.dst), + type = 0x0800} + local packets = {} + for _, size in ipairs(conf.sizes) do + for _=1,100 do + local ip = ipv4:new{src=lib.random_bytes(4), + dst=lib.random_bytes(4)} + if conf.protocol then ip:protocol(conf.protocol) end + ip:total_length(size - eth:sizeof()) + local payload_length = ip:total_length() - ip:sizeof() + local p = packet.allocate() + packet.append(p, eth:header(), eth:sizeof()) + packet.append(p, ip:header(), ip:sizeof()) + packet.append(p, lib.random_bytes(payload_length), payload_length) + table.insert(packets, p) + end + end + return packets +end + +function selftest_rxtx (xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) + local c = config.new() + local basic = require("apps.basic.basic_apps") + local synth = require("apps.test.synth") + config.app(c, "source", synth.Synth, { + packets = random_v4_packets{ + sizes = {60}, + src = xdpmaca, + dst = xdpmacb + }}) + config.app(c, "sink", basic.Sink) + for queue = 0, nqueues-1 do + local queue_a = xdpdeva.."_q"..queue + local queue_b = xdpdevb.."_q"..queue + config.app(c, queue_a, XDP, { + ifname = xdpdeva, + queue = queue + }) + config.app(c, queue_b, XDP, { + ifname = xdpdevb, + queue = queue + }) + config.link(c, "source.output"..queue.." -> "..queue_a..".input") + config.link(c, queue_b..".output -> sink.input"..queue) + end + engine.configure(c) + print("kernel_has_ring_flags", XDP.kernel_has_ring_flags) + engine.main{ duration=1 } + engine.report_links() + local txtotal, rxtotal = 0, 0 + for queue = 0, nqueues-1 do + local tx = link.stats(engine.app_table.source.output["output"..queue]) + local rx = link.stats(engine.app_table.sink.input["input"..queue]) + assert(tx.rxpackets > 0, "No packets sent on queue: "..queue) + assert(rx.rxpackets > 0, "No packets received on queue: "..queue) + txtotal = txtotal + tx.rxpackets + rxtotal = rxtotal + rx.rxpackets + end + assert(math.abs(txtotal - rxtotal) <= txtotal*.10, -- 10% tolerance + "Too little packets received") +end + +function selftest_duplex (xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) + local c = config.new() + local basic = require("apps.basic.basic_apps") + local synth = require("apps.test.synth") + config.app(c, "source_a", synth.Synth, { + packets = random_v4_packets{ + sizes = {60}, + src = xdpmaca, + dst = xdpmacb + }}) + config.app(c, "source_b", synth.Synth, { + packets = random_v4_packets{ + sizes = {60}, + src = xdpmacb, + dst = xdpmaca + }}) + config.app(c, "sink", basic.Sink) + for queue = 0, nqueues-1 do + local queue_a = xdpdeva.."_q"..queue + local queue_b = xdpdevb.."_q"..queue + config.app(c, queue_a, XDP, { + ifname = xdpdeva, + queue = queue + }) + config.app(c, queue_b, XDP, { + ifname = xdpdevb, + queue = queue + }) + config.link(c, "source_a.output"..queue.." -> "..queue_a..".input") + config.link(c, "source_b.output"..queue.." -> "..queue_b..".input") + config.link(c, queue_a..".output -> sink.input_a"..queue) + config.link(c, queue_b..".output -> sink.input_b"..queue) + end + engine.configure(c) + print("kernel_has_ring_flags", XDP.kernel_has_ring_flags) + engine.main{ duration=1 } + engine.report_links() + for label, stream in ipairs{ + ['a->b'] = {'a','b'}, + ['b->a'] = {'b','a'} + } do + local txtotal, rxtotal = 0, 0 + for queue = 0, nqueues-1 do + local tx = link.stats(engine.app_table["source_"..stream[0]].output["output_"..queue]) + local rx = link.stats(engine.app_table.sink.input["input_"..stream[1]..queue]) + assert(tx.rxpackets > 0, "["..label"..] No packets sent on queue: "..queue) + assert(rx.rxpackets > 0, "["..label"..] No packets received on queue: "..queue) + txtotal = txtotal + tx.rxpackets + rxtotal = rxtotal + rx.rxpackets + end + assert(math.abs(txtotal - rxtotal) <= txtotal*.10, -- 10% tolerance + "["..label"..] Too little packets received") + end +end + +function selftest_rxtx_match (xdpdeva, xdpmaca, xdpdevb, xdpmacb) + local c = config.new() + local synth = require("apps.test.synth") + local npackets = require("apps.test.npackets") + local match = require("apps.test.match") + config.app(c, "source", synth.Synth, { + sizes = {60,64,67,128,133,192,256,384,512,777,1024,1500,1501}, + src = xdpmaca, + dst = xdpmacb, + random_payload = true + }) + config.app(c, "npackets", npackets.Npackets, {npackets=1000}) + config.app(c, "match", match.Match) + config.app(c, xdpdeva.."_q0", XDP, {ifname=xdpdeva}) + config.app(c, xdpdevb.."_q0", XDP, {ifname=xdpdevb}) + config.link(c, "source.output -> "..xdpdeva.."_q0.input") + config.link(c, xdpdevb.."_q0.output -> match.rx") + config.link(c, "source.copy -> npackets.input") + config.link(c, "npackets.output -> match.comparator") + engine.configure(c) + engine.main{ duration=.1 } + engine.report_links() + engine.report_apps() + assert(#engine.app_table.match:errors() == 0, "Match errors.") +end + +function selftest_rxtx_match_filter (xdpdeva, xdpmaca, xdpdevb, xdpmacb) + local c = config.new() + local synth = require("apps.test.synth") + local npackets = require("apps.test.npackets") + local match = require("apps.test.match") + config.app(c, "source", synth.Synth, { + packets = random_v4_packets{ + sizes = {60,64,67,128,133,192,256,384,512,777,1024,1500,1501}, + src = xdpmaca, + dst = xdpmacb, + protocol = 42 + }}) + config.app(c, "npackets", npackets.Npackets, {npackets=1000}) + config.app(c, "match", match.Match) + config.app(c, xdpdeva, XDP, {ifname=xdpdeva}) + config.app(c, xdpdevb, XDP, {ifname=xdpdevb, filter="ip proto 42"}) + config.link(c, "source.output -> "..xdpdeva..".input") + config.link(c, xdpdevb..".output -> match.rx") + config.link(c, "source.copy -> npackets.input") + config.link(c, "npackets.output -> match.comparator") + -- Test redirect + engine.configure(c) + engine.main{ duration=.1 } + engine.report_links() + engine.report_apps() + assert(#engine.app_table.match:errors() == 0, "Match errors.") +end + +function selftest_rxtx_match_filter_pass (xdpdeva, xdpmaca, xdpdevb, xdpmacb) + local c = config.new() + local synth = require("apps.test.synth") + local npackets = require("apps.test.npackets") + local match = require("apps.test.match") + config.app(c, "source", synth.Synth, { + packets = random_v4_packets{ + sizes = {60,64,67,128,133,192,256,384,512,777,1024,1500,1501}, + src = xdpmaca, + dst = xdpmacb, + protocol = 42 + }}) + config.app(c, "npackets", npackets.Npackets, {npackets=1000}) + config.app(c, "match", match.Match) + config.app(c, xdpdeva, XDP, {ifname=xdpdeva}) + config.app(c, xdpdevb, XDP, {ifname=xdpdevb, filter="ip proto 42"}) + config.link(c, "source.output -> "..xdpdeva..".input") + config.link(c, xdpdevb..".output -> match.rx") + config.link(c, "source.copy -> npackets.input") + config.link(c, "npackets.output -> match.comparator") + -- Test pass + config.app(c, xdpdevb, XDP, {ifname=xdpdevb, filter="ip6 proto 77"}) + engine.configure(c) + engine.main{ duration=.1 } + engine.report_links() + assert(#engine.app_table.match:errors() == 1000, "Matched packets.") + assert(link.stats(engine.app_table[xdpdevb].output.output).rxpackets == 0, + "Too many packets received on "..xdpdevb) +end + +function selftest_share_interface_worker (xdpdev, queue) + snabb_enable_xdp() + local c = config.new() + local basic = require("apps.basic.basic_apps") + local recv = xdpdev.."_q"..queue + config.app(c, recv, XDP, { + ifname = xdpdev, + queue = queue + }) + config.app(c, "sink", basic.Sink) + config.link(c, recv..".output -> sink.input") + engine.configure(c) + engine.main{ duration=.1, no_report = true } + print("[worker links]") + engine.report_links() + assert(link.stats(engine.app_table.sink.input.input).rxpackets > 0, + "No packets received on "..recv.." in worker.") +end + +function selftest_share_interface (xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) + local c = config.new() + local worker = require("core.worker") + local basic = require("apps.basic.basic_apps") + local synth = require("apps.test.synth") + config.app(c, "source", synth.Synth, { + packets = random_v4_packets{ + sizes = {60}, + src = xdpmaca, + dst = xdpmacb + }}) + config.app(c, "sink", basic.Sink) + for queue = 0, nqueues-2 do + local queue_a = xdpdeva.."_q"..queue + local queue_b = xdpdevb.."_q"..queue + config.app(c, queue_a, XDP, { + ifname = xdpdeva, + queue = queue + }) + config.app(c, queue_b, XDP, { + ifname = xdpdevb, + queue = queue + }) + config.link(c, "source.output"..queue.." -> "..queue_a..".input") + config.link(c, queue_b..".output -> sink.input"..queue) + end + engine.configure(c) + worker.start('worker', ("require('apps.xdp.xdp').selftest_share_interface_worker('%s', %d)") + :format(xdpdevb, nqueues-1)) + engine.main{ done=function () return not worker.status().worker.alive end, + no_report = true } + local worker_status = worker.status().worker.status + print("[parent links]") + engine.report_links() + if worker_status ~= 0 then + os.exit(worker_status) + end +end diff --git a/src/arch/checksum.dasl b/src/arch/checksum.dasl index a188f75658..38fe29a499 100644 --- a/src/arch/checksum.dasl +++ b/src/arch/checksum.dasl @@ -71,6 +71,7 @@ local function gen_checksum () | jl >5 -- Jump to branch '3'. | mov r9d, dword [rdi + r8] -- Fetch 32-bit from data + r8 into r9d. | add rax, r9 -- Sum acc with r9. Accumulate carry. + | adc rax, 0 -- Sum carry-bit into acc. | sub rcx, 4 -- Decrease index by 4. | add r8, 4 -- Next 32-bit. | 5: @@ -78,6 +79,7 @@ local function gen_checksum () | jl >6 -- Jump to branch '4'. | movzx r9, word [rdi + r8] -- Fetch 16-bit from data + r8 into r9. | add rax, r9 -- Sum acc with r9. Accumulate carry. + | adc rax, 0 -- Sum carry-bit into acc. | sub rcx, 2 -- Decrease index by 2. | add r8, 2 -- Next 16-bit. | 6: @@ -85,6 +87,7 @@ local function gen_checksum () | jl >7 -- Jump to branch '5'. | movzx r9, byte [rdi + r8] -- Fetch 8-bit from data + r8 into r9. | add rax, r9 -- Sum acc with r9. Accumulate carry. + | adc rax, 0 -- Sum carry-bit into acc. -- Fold 64-bit into 16-bit. | 7: | mov r9, rax -- Assign acc to r9. @@ -157,4 +160,11 @@ function selftest () assert(hex(checksum(pkt.data, pkt.length, 0)) == hex(ntohs(checksum_lua(pkt.data, pkt.length)))) assert(hex(checksum(pkt.data, pkt.length, 0)) == hex(C.cksum_generic(pkt.data, pkt.length, 0))) end + -- Test carry propagation + for l = 1, 63 do + local pkt = { data = ffi.new("uint8_t[?]", l), length = l } + for i = 0, l-2 do pkt.data[i]=0xff end; pkt.data[l-1] = 0x01 + assert(hex(checksum(pkt.data, pkt.length, 0)) == hex(ntohs(checksum_lua(pkt.data, pkt.length)))) + assert(hex(checksum(pkt.data, pkt.length, 0)) == hex(C.cksum_generic(pkt.data, pkt.length, 0))) + end end diff --git a/src/core/app.lua b/src/core/app.lua index 3daef4750a..f4059d6d30 100644 --- a/src/core/app.lua +++ b/src/core/app.lua @@ -76,13 +76,15 @@ busywait = false -- Profiling with vmprofile -------------------------------- +vmprofile_enabled = true + -- Low-level FFI ffi.cdef[[ int vmprofile_get_profile_size(); void vmprofile_set_profile(void *counters); ]] -local vmprofile_t = ffi.new("uint8_t["..C.vmprofile_get_profile_size().."]") +local vmprofile_t = ffi.typeof("uint8_t["..C.vmprofile_get_profile_size().."]") local vmprofiles = {} local function getvmprofile (name) @@ -96,6 +98,18 @@ function setvmprofile (name) C.vmprofile_set_profile(getvmprofile(name)) end +function clearvmprofiles () + jit.vmprofile.stop() + for name, profile in pairs(vmprofiles) do + shm.unmap(profile) + shm.unlink("vmprofile/"..name..".vmprofile") + vmprofiles[name] = nil + end + if vmprofile_enabled then + jit.vmprofile.start() + end +end + -- True when the engine is running the breathe loop. local running = false diff --git a/src/core/lib.lua b/src/core/lib.lua index fcf6a92c64..3d8b330e86 100644 --- a/src/core/lib.lua +++ b/src/core/lib.lua @@ -71,8 +71,7 @@ function writefile (filename, value) local f = io.open(filename, "w") if f == nil then error("Unable to open file: " .. filename) end local result = f:write(value) - f:close() - return result + return f:close() and result end function readlink (path) diff --git a/src/core/main.lua b/src/core/main.lua index 9cfaa17c1f..433fa3c09a 100644 --- a/src/core/main.lua +++ b/src/core/main.lua @@ -8,7 +8,7 @@ package.path = '' local STP = require("lib.lua.StackTracePlus") local ffi = require("ffi") -local vmprofile = require("jit.vmprofile") +local jit = require("jit") local lib = require("core.lib") local shm = require("core.shm") local C = ffi.C @@ -47,7 +47,10 @@ function main () error("fatal: "..ffi.os.."/"..ffi.arch.." is not a supported platform\n") end initialize() - vmprofile.start() + -- Setup audit.log, vmprofile + engine.enable_auditlog() + engine.setvmprofile("program") + jit.vmprofile.start() if lib.getenv("SNABB_PROGRAM_LUACODE") then -- Run the given Lua code instead of the command-line local expr = lib.getenv("SNABB_PROGRAM_LUACODE") @@ -67,7 +70,7 @@ function main () require(modulename(program)).run(args) end end - vmprofile.stop() + jit.vmprofile.stop() end -- Take the program name from the first argument, unless the first @@ -162,9 +165,6 @@ function initialize () _G.packet = require("core.packet"); _G.packet.initialize() _G.timer = require("core.timer") _G.main = getfenv() - -- Setup audit.log, vmprofile - engine.enable_auditlog() - engine.setvmprofile("program") end function handler (reason) @@ -185,6 +185,7 @@ function shutdown (pid) safely(function () require("core.packet").shutdown(pid) end) safely(function () require("apps.interlink.receiver").shutdown(pid) end) safely(function () require("apps.interlink.transmitter").shutdown(pid) end) + safely(function () require("apps.mellanox.connectx").shutdown(pid) end) -- Parent process performs additional cleanup steps. -- (Parent is the process whose 'group' folder is not a symlink.) diff --git a/src/core/packet.lua b/src/core/packet.lua index 1495483403..ca71a3e547 100644 --- a/src/core/packet.lua +++ b/src/core/packet.lua @@ -24,11 +24,15 @@ max_payload = tonumber(C.PACKET_PAYLOAD_SIZE) -- For operations that add or remove headers from the beginning of a -- packet, instead of copying around the payload we just move the -- packet structure as a whole around. -local packet_alignment = 512 -local default_headroom = 256 +packet_alignment = 512 +default_headroom = 256 -- The Intel82599 driver requires even-byte alignment, so let's keep -- things aligned at least this much. -local minimum_alignment = 2 +minimum_alignment = 2 + +-- Copy read-only constants to locals +local max_payload, packet_alignment, default_headroom, minimum_alignment = + max_payload, packet_alignment, default_headroom, minimum_alignment local function get_alignment (addr, alignment) -- Precondition: alignment is a power of 2. @@ -257,7 +261,7 @@ end function from_string (d) return from_pointer(d, #d) end -- Free a packet that is no longer in use. -local function free_internal (p) +function free_internal (p) local ptr = ffi.cast("char*", p) p = ffi.cast(packet_ptr_t, ptr - get_headroom(ptr) + default_headroom) p.length = 0 @@ -269,9 +273,12 @@ function account_free (p) counter.add(engine.freebytes, p.length) -- Calculate bits of physical capacity required for packet on 10GbE -- Account for minimum data size and overhead of CRC and inter-packet gap - counter.add(engine.freebits, (math.max(p.length, 46) + 4 + 5) * 8) + -- https://en.wikipedia.org/wiki/Ethernet_frame + counter.add(engine.freebits, (12 + 8 + math.max(p.length, 60) + 4) * 8) end +local free_internal, account_free = + free_internal, account_free function free (p) account_free(p) free_internal(p) diff --git a/src/lib/ctable.lua b/src/lib/ctable.lua index 6397f649c4..fb575efdc5 100644 --- a/src/lib/ctable.lua +++ b/src/lib/ctable.lua @@ -160,11 +160,14 @@ end -- hugepages, not this code. local try_huge_pages = true local huge_page_threshold = 1e6 +local huge_page_size = memory.get_huge_page_size() local function calloc(t, count) if count == 0 then return 0, 0 end local byte_size = ffi.sizeof(t) * count + local alloc_byte_size = byte_size local mem, err if try_huge_pages and byte_size > huge_page_threshold then + alloc_byte_size = ceil(byte_size/huge_page_size) * huge_page_size mem, err = S.mmap(nil, byte_size, 'read, write', 'private, anonymous, hugetlb') if not mem then @@ -179,7 +182,7 @@ local function calloc(t, count) if not mem then error("mmap failed: " .. tostring(err)) end end local ret = ffi.cast(ffi.typeof('$*', t), mem) - ffi.gc(ret, function (ptr) S.munmap(ptr, byte_size) end) + ffi.gc(ret, function (ptr) S.munmap(ptr, alloc_byte_size) end) return ret, byte_size end diff --git a/src/lib/hardware/README.md b/src/lib/hardware/README.md index 8e345e563e..1a1de8719b 100644 --- a/src/lib/hardware/README.md +++ b/src/lib/hardware/README.md @@ -71,6 +71,11 @@ Returns a table containing information about the PCI device by Returns the module name for a suitable device driver (if available) for a device of *model* from *vendor*. +— Function **pci.reset_device** *pciaddress* + +Reset a PCI device (function). Can be useful for returning the device +to a clean initial state. + — Function **pci.unbind_device_from_linux** *pciaddress* Forces Linux to unbind the device identified by *pciaddress* from any diff --git a/src/lib/hardware/pci.lua b/src/lib/hardware/pci.lua index bfe91ede56..ab1714be03 100644 --- a/src/lib/hardware/pci.lua +++ b/src/lib/hardware/pci.lua @@ -67,6 +67,7 @@ model = { ["X520"] = 'Intel X520', ["i350"] = 'Intel 350', ["i210"] = 'Intel 210', + ["X710"] = 'Intel X710', ["XL710_VF"] = 'Intel XL710/X710 Virtual Function', ["AVF"] = 'Intel AVF' } @@ -85,17 +86,24 @@ local cards = { ["0x157b"] = {model = model["i210"], driver = 'apps.intel_mp.intel_mp'}, ["0x154c"] = {model = model["XL710_VF"], driver = 'apps.intel_avf.intel_avf'}, ["0x1889"] = {model = model["AVF"], driver = 'apps.intel_avf.intel_avf'}, + ["0x1572"] = {model = model["X710"], driver = nil}, }, ["0x1924"] = { ["0x0903"] = {model = 'SFN7122F', driver = 'apps.solarflare.solarflare'} }, + ["0x15b3"] = { + ["0x1013" ] = {model = 'MT27700', driver = 'apps.mellanox.connectx'}, + ["0x1017" ] = {model = 'MT27800', driver = 'apps.mellanox.connectx'}, + ["0x1019" ] = {model = 'MT28800', driver = 'apps.mellanox.connectx'}, + ["0x101d" ] = {model = 'MT2892', driver = 'apps.mellanox.connectx'}, + }, } local link_names = { ['apps.solarflare.solarflare'] = { "rx", "tx" }, ['apps.intel_mp.intel_mp'] = { "input", "output" }, ['apps.intel_avf.intel_avf'] = { "input", "output" }, - ['apps.intel.intel_app'] = { "rx", "tx" } + ['apps.mellanox.connectx'] = { "input", "output" }, } -- Return the name of the Lua module that implements support for this device. @@ -121,6 +129,18 @@ function is_usable (info) return info.driver and (info.interface == nil or info.status == 'down') end +-- Reset a PCI function. +-- See https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-bus-pci +function reset_device (pciaddress) + root_check() + local p = path(pciaddress).."/reset" + if lib.can_write(p) then + lib.writefile(p, "1") + else + error("Cannot write: "..p) + end +end + --- Force Linux to release the device with `pciaddress`. --- The corresponding network interface (e.g. `eth0`) will disappear. function unbind_device_from_linux (pciaddress) diff --git a/src/lib/numa.lua b/src/lib/numa.lua index 55b510be24..f3ab02418a 100644 --- a/src/lib/numa.lua +++ b/src/lib/numa.lua @@ -145,7 +145,9 @@ local irqbalanced_checked = false local function assert_irqbalanced_disabled (warn) if irqbalanced_checked then return end irqbalanced_checked = true - for path in os.getenv('PATH'):split(':') do + local env_path = os.getenv('PATH') + if not env_path then return end + for path in env_path:split(':') do if S.stat(path..'/irqbalance') then if S.stat('/etc/default/irqbalance') then for line in io.lines('/etc/default/irqbalance') do @@ -158,12 +160,20 @@ local function assert_irqbalanced_disabled (warn) end end +local function read_cpu_performance_governor (cpu) + local path = '/sys/devices/system/cpu/cpu'..cpu..'/cpufreq/scaling_governor' + local f = io.open(path) + if not f then return "unknown" end + local gov = f:read() + f:close() + return gov +end + local function check_cpu_performance_tuning (cpu, strict) local warn = warn if strict then warn = die end assert_irqbalanced_disabled(warn) - local path = '/sys/devices/system/cpu/cpu'..cpu..'/cpufreq/scaling_governor' - local gov = assert(io.open(path)):read() + local gov = read_cpu_performance_governor(cpu) if not gov:match('performance') then warn('Expected performance scaling governor for CPU %s, but got "%s"', cpu, gov) @@ -222,6 +232,7 @@ function bind_to_numa_node (node, policy) -- Migrate any pages that might have the wrong affinity. local from_mask = assert(S.get_mempolicy(nil, nil, nil, 'mems_allowed')).mask + from_mask[node] = false local ok, err = S.migrate_pages(0, from_mask, node) if not ok then warn("Failed to migrate pages to NUMA node %d: %s\n", diff --git a/src/lib/ptree/ptree.lua b/src/lib/ptree/ptree.lua index a6aa22232a..f8945483e9 100644 --- a/src/lib/ptree/ptree.lua +++ b/src/lib/ptree/ptree.lua @@ -110,10 +110,10 @@ function new_manager (conf) ret.rpc_callee = rpc.prepare_callee('snabb-config-leader-v1') ret.rpc_handler = rpc.dispatch_handler(ret, 'rpc_', ret.trace) - ret:set_initial_configuration(conf.initial_configuration) - ret:start() + ret:set_initial_configuration(conf.initial_configuration) + return ret end @@ -374,6 +374,14 @@ function Manager:monitor_worker_stats(id) counters.archived[0] = counters.archived[0] + val counter.delete(qualified_name) S.unlink(strip_suffix(qualified_name, ".counter")..".rrd") + local last_in_set = true + for _ in pairs(counters.active) do + last_in_set = false + break + end + if last_in_set then + self:cleanup_aggregated_stats(name, 'counter') + end end elseif has_suffix(ev.name, '.gauge') then local gauges = self.gauges[name] @@ -390,6 +398,14 @@ function Manager:monitor_worker_stats(id) gauges.active[pid] = nil gauges.rrd[pid] = nil S.unlink(strip_suffix(qualified_name, ".gauge")..".rrd") + local last_in_set = true + for _ in pairs(gauges.active) do + last_in_set = false + break + end + if last_in_set then + self:cleanup_aggregated_stats(name, 'gauge') + end end end end @@ -424,6 +440,20 @@ function Manager:sample_active_stats() end end +function Manager:cleanup_aggregated_stats(name, typ) + shm.unlink(name) + shm.unlink(strip_suffix(name, "."..typ)..".rrd") + self:cleanup_parent_directories(name) +end + +function Manager:cleanup_parent_directories(name) + local parent = name:match("(.*)/[^/]+$") + if not parent then return end + for _ in pairs(shm.children(parent)) do return end + shm.unlink(parent) + self:cleanup_parent_directories(parent) +end + function Manager:start_worker_for_graph(id, graph) local scheduling = self:compute_scheduling_for_worker(id, graph) self:info('Starting worker %s.', id) diff --git a/src/lib/ptree/support/snabb-softwire-v2.lua b/src/lib/ptree/support/snabb-softwire-v3.lua similarity index 96% rename from src/lib/ptree/support/snabb-softwire-v2.lua rename to src/lib/ptree/support/snabb-softwire-v3.lua index dd917d225e..8b3c9c6b81 100644 --- a/src/lib/ptree/support/snabb-softwire-v2.lua +++ b/src/lib/ptree/support/snabb-softwire-v3.lua @@ -19,7 +19,7 @@ local path_data = require('lib.yang.path_data') local generic = require('lib.ptree.support').generic_schema_config_support local binding_table = require("apps.lwaftr.binding_table") --- Packs snabb-softwire-v2 softwire entry into softwire and PSID blob +-- Packs snabb-softwire-v3 softwire entry into softwire and PSID blob -- -- The data plane stores a separate table of psid maps and softwires. It -- requires that we give it a blob it can quickly add. These look rather @@ -65,7 +65,7 @@ end local softwire_grammar local function get_softwire_grammar() if not softwire_grammar then - local schema = yang.load_schema_by_name('snabb-softwire-v2') + local schema = yang.load_schema_by_name('snabb-softwire-v3') local grammar = data.config_grammar_from_schema(schema) softwire_grammar = assert(grammar.members['softwire-config']. @@ -234,7 +234,7 @@ local function schema_getter(schema_name, path) end local function snabb_softwire_getter(path) - return schema_getter('snabb-softwire-v2', path) + return schema_getter('snabb-softwire-v3', path) end local function ietf_softwire_br_getter(path) @@ -323,7 +323,7 @@ local function ietf_softwire_br_translator () softwire_payload_mtu = int.mtu, softwire_path_mru = ext.mtu, -- FIXME: There's no equivalent of softwire-num-max in - -- snabb-softwire-v2. + -- snabb-softwire-v3. softwire_num_max = 0xffffffff, enable_hairpinning = int.hairpinning, binding_table = { @@ -423,7 +423,7 @@ local function ietf_softwire_br_translator () } local path_tail = path_tails[leaf] if path_tail then - return {{'set', {schema='snabb-softwire-v2', + return {{'set', {schema='snabb-softwire-v3', path='/softwire-config/'..path_tail, config=tostring(arg)}}} else @@ -439,15 +439,15 @@ local function ietf_softwire_br_translator () } local path_tail = path_tails[leaf] if path_tail then - return {{'set', {schema='snabb-softwire-v2', + return {{'set', {schema='snabb-softwire-v3', path='/softwire-config/'..path_tail, config=tostring(arg)}}} elseif leaf == 'icmpv4-rate' then local head = '/softwire-config/external-interface/error-rate-limiting' return { - {'set', {schema='snabb-softwire-v2', path=head..'/packets', + {'set', {schema='snabb-softwire-v3', path=head..'/packets', config=tostring(arg * 2)}}, - {'set', {schema='snabb-softwire-v2', path=head..'/period', + {'set', {schema='snabb-softwire-v3', path=head..'/period', config='2'}}} else error('unrecognized leaf: '..leaf) @@ -457,15 +457,15 @@ local function ietf_softwire_br_translator () then local leaf = path[#path].name if leaf == 'generate-icmpv6-errors' then - return {{'set', {schema='snabb-softwire-v2', + return {{'set', {schema='snabb-softwire-v3', path='/softwire-config/internal-interface/generate-icmp-errors', config=tostring(arg)}}} elseif leaf == 'icmpv6-rate' then local head = '/softwire-config/internal-interface/error-rate-limiting' return { - {'set', {schema='snabb-softwire-v2', path=head..'/packets', + {'set', {schema='snabb-softwire-v3', path=head..'/packets', config=tostring(arg * 2)}}, - {'set', {schema='snabb-softwire-v2', path=head..'/period', + {'set', {schema='snabb-softwire-v3', path=head..'/period', config='2'}}} else error('unrecognized leaf: '..leaf) @@ -480,7 +480,7 @@ local function ietf_softwire_br_translator () not path_has_query(path, #path) then local bt = native_binding_table_from_ietf(arg) - return {{'set', {schema='snabb-softwire-v2', + return {{'set', {schema='snabb-softwire-v3', path='/softwire-config/binding-table', config=serialize_binding_table(bt)}}} else @@ -536,7 +536,7 @@ local function ietf_softwire_br_translator () -- to add a check here that the IPv4/PSID is not present in the -- binding table. table.insert(updates, - {'remove', {schema='snabb-softwire-v2', + {'remove', {schema='snabb-softwire-v3', path=softwire_path..old_query}}) local config_str = string.format([[{ @@ -553,7 +553,7 @@ local function ietf_softwire_br_translator () path[entry_path_len].query['binding-ipv6info'], new.port_set.psid_len, new.port_set.psid_offset) table.insert(updates, - {'add', {schema='snabb-softwire-v2', + {'add', {schema='snabb-softwire-v3', path=softwire_path, config=config_str}}) return updates @@ -613,7 +613,7 @@ local function ietf_softwire_br_translator () table.insert(additions, config_str) end table.insert(updates, - {'add', {schema='snabb-softwire-v2', + {'add', {schema='snabb-softwire-v3', path=softwire_path, config=table.concat(additions, '\n')}}) return updates @@ -642,7 +642,7 @@ local function ietf_softwire_br_translator () return string.format('[ipv4=%s][psid=%s]', ipv4_ntop(ipv4), psid) end local query = q(entry.binding_ipv4_addr, entry.port_set.psid) - return {{'remove', {schema='snabb-softwire-v2', + return {{'remove', {schema='snabb-softwire-v3', path=softwire_path..query}}} else return error('unsupported path: '..path_str) diff --git a/src/lib/ptree/worker.lua b/src/lib/ptree/worker.lua index 971213f230..f188807802 100644 --- a/src/lib/ptree/worker.lua +++ b/src/lib/ptree/worker.lua @@ -100,16 +100,12 @@ function Worker:handle_actions_from_manager() end function Worker:main () - local vmprofile = require("jit.vmprofile") local stop = engine.now() + self.duration local next_time = engine.now() - -- Setup vmprofile. - engine.setvmprofile("engine") - vmprofile.start() - if not engine.auditlog_enabled then engine.enable_auditlog() end + engine.setvmprofile("engine") repeat self.breathe() if next_time < engine.now() then diff --git a/src/lib/scheduling.lua b/src/lib/scheduling.lua index ab3ac2f0d8..425f6147e6 100644 --- a/src/lib/scheduling.lua +++ b/src/lib/scheduling.lua @@ -16,7 +16,9 @@ local scheduling_opts = { cpu = {}, -- CPU index (integer). real_time = {}, -- Boolean. ingress_drop_monitor = {}, -- Action string: one of 'flush' or 'warn'. + profile = {default=true}, -- Boolean. busywait = {default=true}, -- Boolean. + enable_xdp = {}, -- Enable Snabb XDP mode (see apps.xdp.xdp). eval = {} -- String. } @@ -42,6 +44,16 @@ function sched_apply.busywait (busywait) engine.busywait = busywait end +function sched_apply.enable_xdp (opt) + if opt then require('apps.xdp.xdp').snabb_enable_xdp(opt) end +end + +function sched_apply.profile (profile) + engine.vmprofile_enabled = profile + local jit = require('jit') + if profile then jit.vmprofile.start() else jit.vmprofile.stop() end +end + function sched_apply.eval (str) loadstring(str)() end diff --git a/src/lib/timers/ingress_drop_monitor.lua b/src/lib/timers/ingress_drop_monitor.lua index c1268fc04a..b8b82e6e53 100644 --- a/src/lib/timers/ingress_drop_monitor.lua +++ b/src/lib/timers/ingress_drop_monitor.lua @@ -21,11 +21,13 @@ local IngressDropMonitor = {} function new(args) local ret = { threshold = args.threshold or 100000, + threshold_timeout = args.threshold_timeout or 10, wait = args.wait or 30, grace_period = args.grace_period or 10, action = args.action or 'flush', tips_url = args.tips_url or default_tips_url, last_flush = now(), -- Start in the grace period. + last_drop = now(), last_value = ffi.new('uint64_t[1]'), current_value = ffi.new('uint64_t[1]'), } @@ -71,6 +73,12 @@ function IngressDropMonitor:jit_flush_if_needed () self.last_value[0] = self.current_value[0] return end + if self.last_value[0] < self.current_value[0] then + self.last_drop = now() + elseif now() - self.last_drop > self.threshold_timeout then + -- Reset last_value if no drops occurred within threshold_timeout. + self.last_value[0] = self.current_value[0] + end if self.current_value[0] - self.last_value[0] < self.threshold then self.ingress_packet_drop_alarm:clear() return @@ -88,7 +96,10 @@ function IngressDropMonitor:jit_flush_if_needed () print(msg) self.ingress_packet_drop_alarm:raise({alarm_text=msg}) - if self.action == 'flush' then jit.flush() end + if self.action == 'flush' then + jit.flush() + engine.clearvmprofiles() + end end function IngressDropMonitor:timer(interval) diff --git a/src/lib/yang/data.lua b/src/lib/yang/data.lua index 2d172eb0a7..e3404c37e9 100644 --- a/src/lib/yang/data.lua +++ b/src/lib/yang/data.lua @@ -388,10 +388,9 @@ end local function struct_parser(keyword, members, ctype) local keys = {} for k,v in pairs(members) do table.insert(keys, k) end - local function init() return nil end - local function parse1(P) - local ret = {} - local expanded_members = {} + local ret, expanded_members + local function init() + ret, expanded_members = {}, {} for _,k in ipairs(keys) do if members[k].represents then -- Choice fields don't include the name of the choice block in the data. They @@ -407,6 +406,8 @@ local function struct_parser(keyword, members, ctype) expanded_members[k] = members[k] end end + end + local function parse1(P) P:skip_whitespace() P:consume("{") P:skip_whitespace() @@ -422,10 +423,6 @@ local function struct_parser(keyword, members, ctype) ret[id] = sub.parse(P, ret[id], k) P:skip_whitespace() end - for k,_ in pairs(expanded_members) do - local id = normalize_id(k) - ret[id] = expanded_members[k].finish(ret[id], k) - end return ret end local function parse(P, out) @@ -434,12 +431,14 @@ local function struct_parser(keyword, members, ctype) end local struct_t = ctype and typeof(ctype) local function finish(out, leaf) + for k,_ in pairs(expanded_members) do + out = out or {} + local id = normalize_id(k) + out[id] = expanded_members[k].finish(out[id], k) + end -- FIXME check mandatory values. if struct_t then - local ret - if out == nil then ret = struct_t() - else ret = struct_t(out) end - return ret + return struct_t(out) else return out end @@ -633,7 +632,9 @@ local function table_parser(keyword, keys, values, native_key, key_ctype, return assoc end local function finish(assoc) - return assoc:finish() + if assoc then + return assoc:finish() + end end return {init=init, parse=parse, finish=finish} end @@ -870,7 +871,7 @@ function xpath_printer_from_grammar(production, print_default, root) print_yang_string(k, file) file:write(' ') end - local function body_printer(productions, order) + local function body_printer(productions) -- Iterate over productions trying to translate to other statements. This -- is used for example in choice statements raising the lower statements -- in case blocks up to the level of the choice, in place of the choice. @@ -885,11 +886,9 @@ function xpath_printer_from_grammar(production, print_default, root) end end productions = translated - if not order then - order = {} - for k,_ in pairs(productions) do table.insert(order, k) end - table.sort(order) - end + local order = {} + for k,_ in pairs(productions) do table.insert(order, k) end + table.sort(order) local printers = {} for keyword,production in pairs(productions) do local printer = printer(keyword, production, printers) @@ -904,8 +903,8 @@ function xpath_printer_from_grammar(production, print_default, root) end end end - local function key_composer (productions, order) - local printer = body_printer(productions, order) + local function key_composer (productions) + local printer = body_printer(productions) local file = {t={}} function file:write (str) str = str:match("([^%s]+)") @@ -958,13 +957,8 @@ function xpath_printer_from_grammar(production, print_default, root) -- As a special case, the table handler allows the keyword to be nil, -- for printing tables at the top level without keywords. function handlers.table(keyword, production) - local key_order, value_order = {}, {} - for k,_ in pairs(production.keys) do table.insert(key_order, k) end - for k,_ in pairs(production.values) do table.insert(value_order, k) end - table.sort(key_order) - table.sort(value_order) - local compose_key = key_composer(production.keys, key_order) - local print_value = body_printer(production.values, value_order) + local compose_key = key_composer(production.keys) + local print_value = body_printer(production.values) if production.key_ctype and production.value_ctype then return function(data, file, path) path = path or '' @@ -1120,7 +1114,7 @@ function influxdb_printer_from_grammar(production, print_default, root) file:write(file.is_tag and value or ' value='..value) file:write('\n') end - local function body_printer(productions, order) + local function body_printer(productions) -- Iterate over productions trying to translate to other statements. This -- is used for example in choice statements raising the lower statements -- in case blocks up to the level of the choice, in place of the choice. @@ -1135,11 +1129,9 @@ function influxdb_printer_from_grammar(production, print_default, root) end end productions = translated - if not order then - order = {} - for k,_ in pairs(productions) do table.insert(order, k) end - table.sort(order) - end + local order = {} + for k,_ in pairs(productions) do table.insert(order, k) end + table.sort(order) local printers = {} for keyword,production in pairs(productions) do local printer = printer(keyword, production, printers) @@ -1159,8 +1151,8 @@ function influxdb_printer_from_grammar(production, print_default, root) :gsub(',', '\\,') :gsub(' ', '\\ ') end - local function key_composer (productions, order) - local printer = body_printer(productions, order) + local function key_composer (productions) + local printer = body_printer(productions) local file = {t={}, is_tag=true} function file:write (str) str = str:match("([^%s]+)") @@ -1224,14 +1216,9 @@ function influxdb_printer_from_grammar(production, print_default, root) -- As a special case, the table handler allows the keyword to be nil, -- for printing tables at the top level without keywords. function handlers.table(keyword, production) - local key_order, value_order = {}, {} - for k,_ in pairs(production.keys) do table.insert(key_order, k) end - for k,_ in pairs(production.values) do table.insert(value_order, k) end - table.sort(key_order) - table.sort(value_order) local is_key_unique = is_key_unique(production) - local compose_key = key_composer(production.keys, key_order) - local print_value = body_printer(production.values, value_order) + local compose_key = key_composer(production.keys) + local print_value = body_printer(production.values) if production.key_ctype and production.value_ctype then return function(data, file, path) path = path or '' @@ -1361,7 +1348,7 @@ function data_printer_from_grammar(production, print_default) print_yang_string(k, file) file:write(' ') end - local function body_printer(productions, order) + local function body_printer(productions) -- Iterate over productions trying to translate to other statements. This -- is used for example in choice statements raising the lower statements -- in case blocks up to the level of the choice, in place of the choice. @@ -1376,11 +1363,9 @@ function data_printer_from_grammar(production, print_default) end end productions = translated - if not order then - order = {} - for k,_ in pairs(productions) do table.insert(order, k) end - table.sort(order) - end + local order = {} + for k,_ in pairs(productions) do table.insert(order, k) end + table.sort(order) local printers = {} for keyword,production in pairs(productions) do local printer = printer(keyword, production, printers) @@ -1426,13 +1411,8 @@ function data_printer_from_grammar(production, print_default) -- As a special case, the table handler allows the keyword to be nil, -- for printing tables at the top level without keywords. function handlers.table(keyword, production) - local key_order, value_order = {}, {} - for k,_ in pairs(production.keys) do table.insert(key_order, k) end - for k,_ in pairs(production.values) do table.insert(value_order, k) end - table.sort(key_order) - table.sort(value_order) - local print_key = body_printer(production.keys, key_order) - local print_value = body_printer(production.values, value_order) + local print_key = body_printer(production.keys) + local print_value = body_printer(production.values) if production.key_ctype and production.value_ctype then return function(data, file, indent) for entry in data:iterate() do @@ -1761,6 +1741,15 @@ function selftest() description "Address prefixes bound to this interface."; } + + list choices { + key id; + leaf id { type string; } + choice choice { + leaf red { type string; } + leaf blue { type string; } + } + } }]]) local data = load_config_for_schema(test_schema, @@ -1774,6 +1763,9 @@ function selftest() } addr 1.2.3.4; address 1.2.3.4/24; + choices { id "one"; blue "hey"; } + choices { id "two"; red "bye"; } + ]]) for i =1,2 do assert(data.fruit_bowl.description == 'ohai') @@ -1786,6 +1778,8 @@ function selftest() assert(contents.baz.score == 9) assert(contents.baz.tree_grown == true) assert(data.addr == util.ipv4_pton('1.2.3.4')) + assert(data.choices.one.blue == "hey") + assert(data.choices.two.red == "bye") local stream = mem.tmpfile() print_config_for_schema(test_schema, data, stream) @@ -1843,6 +1837,44 @@ function selftest() ]]) assert(object.summary.shelves_active) + -- Test nested defaults + local default_schema = [[module default-schema { + namespace "urn:ietf:params:xml:ns:yang:default-schema"; + prefix "default"; + + container optional { + leaf default { + type string; + default "foo"; + } + } + }]] + local loaded_schema = schema.load_schema(default_schema) + local object = load_config_for_schema(loaded_schema, + mem.open_input_string "") + assert(object.optional) + assert(object.optional.default == "foo") + + local default2_schema = [[module default2-schema { + namespace "urn:ietf:params:xml:ns:yang:default2-schema"; + prefix "default"; + + container optional1 { + container optional2 { + leaf default { + type string; + default "foo"; + } + } + } + }]] + local loaded_schema = schema.load_schema(default2_schema) + local object = load_config_for_schema(loaded_schema, + mem.open_input_string "") + assert(object.optional1) + assert(object.optional1.optional2) + assert(object.optional1.optional2.default == "foo") + -- Test choice field. local choice_schema = schema.load_schema([[module choice-schema { namespace "urn:ietf:params:xml:ns:yang:choice-schema"; diff --git a/src/lib/yang/path_data.lua b/src/lib/yang/path_data.lua index c85bd352f4..991d115b10 100644 --- a/src/lib/yang/path_data.lua +++ b/src/lib/yang/path_data.lua @@ -562,6 +562,25 @@ local function pairs_from_grammar(grammar) end end +local function expanded_pairs(values) + -- Return an iterator for each non-choice pair in values and each pair of + -- all choice bodies recursively. + local expanded = {} + local function expand(values) + for name, value in pairs(values) do + if value.type == 'choice' then + for _, body in pairs(value.choices) do + expand(body) + end + else + expanded[name] = value + end + end + end + expand(values) + return pairs(expanded) +end + function uniqueness_checker_from_grammar(grammar) -- Generate checker for table local function unique_assertion(leaves, grammar) @@ -593,7 +612,7 @@ function uniqueness_checker_from_grammar(grammar) elseif grammar.type == 'table' then local pairs = pairs_from_grammar(grammar) -- visit values - for name, value in _G.pairs(grammar.values) do + for name, value in expanded_pairs(grammar.values) do for k, datum in pairs(data) do visit_unique_and_check(value, datum[normalize_id(name)]) end @@ -604,7 +623,7 @@ function uniqueness_checker_from_grammar(grammar) end elseif grammar.type == 'struct' then -- visit members - for name, member in pairs(grammar.members) do + for name, member in expanded_pairs(grammar.members) do visit_unique_and_check(member, data[normalize_id(name)]) end end @@ -648,7 +667,7 @@ function minmax_elements_checker_from_grammar(grammar) elseif grammar.type == 'table' then -- visit values local pairs = pairs_from_grammar(grammar) - for name, value in _G.pairs(grammar.values) do + for name, value in expanded_pairs(grammar.values) do for k, datum in pairs(data) do visit_minmax_and_check(value, datum[normalize_id(name)], name) end @@ -657,7 +676,7 @@ function minmax_elements_checker_from_grammar(grammar) minmax_assertion(grammar, name)(data) elseif grammar.type == 'struct' then -- visit members - for name, member in pairs(grammar.members) do + for name, member in expanded_pairs(grammar.members) do visit_minmax_and_check(member, data[normalize_id(name)], name) end end @@ -834,7 +853,7 @@ function selftest() local checker = consistency_checker_from_schema_by_name('ietf-alarms', false) assert(checker) - local scm = schema.load_schema_by_name('snabb-softwire-v2') + local scm = schema.load_schema_by_name('snabb-softwire-v3') local grammar = data.config_grammar_from_schema(scm) setter_for_grammar(grammar, "/softwire-config/instance[device=test]/".. "queue[id=0]/external-interface/ip 208.118.235.148") @@ -1002,5 +1021,73 @@ function selftest() assert(not success) print(result) + -- Test unique restrictions in choice body: + local choice_unique_schema = schema.load_schema([[module choice-unique-schema { + namespace "urn:ietf:params:xml:ns:yang:choice-unique-schema"; + prefix "test"; + + choice ab { + list unique_test { + key "testkey"; unique "testleaf testleaf2"; + leaf testkey { type string; mandatory true; } + leaf testleaf { type string; mandatory true; } + leaf testleaf2 { type string; mandatory true; } + } + list duplicate_test { + key "testkey"; + leaf testkey { type string; mandatory true; } + leaf testleaf { type string;} + leaf testleaf2 { type string;} + } + } + }]]) + local checker = consistency_checker_from_schema(choice_unique_schema, true) + + -- Test unique validation in choice body (should fail) + local success, result = pcall( + checker, + data.load_config_for_schema(choice_unique_schema, + mem.open_input_string [[ + unique_test { + testkey "foo"; + testleaf "bar"; + testleaf2 "baz"; + } + unique_test { + testkey "foo2"; + testleaf "bar"; + testleaf2 "baz"; + } + ]])) + assert(not success) + + -- Test unique validation in choice body (should succeed) + checker(data.load_config_for_schema(choice_unique_schema, + mem.open_input_string [[ + unique_test { + testkey "foo"; + testleaf "bar"; + testleaf2 "baz"; + } + unique_test { + testkey "foo2"; + testleaf "bar2"; + testleaf2 "baz"; + } + ]])) + + -- Test unique validation in choice body (should succeed) + checker(data.load_config_for_schema(choice_unique_schema, + mem.open_input_string [[ + duplicate_test { + testkey "foo"; + testleaf "bar"; + } + duplicate_test { + testkey "foo2"; + testleaf "bar"; + } + ]])) + print("selftest: ok") end diff --git a/src/lib/yang/schema.lua b/src/lib/yang/schema.lua index 21f5ad1af4..c3b121cb65 100644 --- a/src/lib/yang/schema.lua +++ b/src/lib/yang/schema.lua @@ -1303,7 +1303,7 @@ function selftest() load_schema_by_name('ietf-softwire-common') load_schema_by_name('ietf-softwire-br') - load_schema_by_name('snabb-softwire-v2') + load_schema_by_name('snabb-softwire-v3') local br = load_schema_by_name('ietf-softwire-br') local binding = br.body['br-instances'].body['br-type'].body['binding'] diff --git a/src/lib/yang/snabb-softwire-v3.yang b/src/lib/yang/snabb-softwire-v3.yang new file mode 100644 index 0000000000..c4b4eb5cdc --- /dev/null +++ b/src/lib/yang/snabb-softwire-v3.yang @@ -0,0 +1,996 @@ +module snabb-softwire-v3 { + yang-version 1.1; + namespace snabb:softwire-v3; + prefix softwire; + + import ietf-inet-types { prefix inet; } + import ietf-yang-types { prefix yang; } + + organization "Snabb"; + contact "Max Rottenkolber "; + description + "Configuration for the Snabb lwAFTR."; + + revision 2021-11-08 { + description + "Change module+namespace to v3. Update organization and contact. + Fix mistakes in leaf descriptions. + Add default value for error-rate-limiting/packets. + Allow more than two queues (lift id leaf range restriction). + Move leaf external-interface/device up as external-device. + Add softwire-state/{in,out}-icmpv{4,6}-echo-{bytes,packets}, counters. + Add softwire-state/{in,out}-arp-{request,reply}-{bytes,packets}, counters. + Add softwire-state/{in,out}-ndp-{ns,na}-{bytes,packets}, counters. + Renamed softwire-state/{in,out}-icmpv{4,6}-{bytes,packets} + to softwire-state/{in,out}-icmpv{4,6}-error-{bytes,packets}."; + } + + revision 2019-09-17 { + description + "Add discontinuity time to softwire-state."; + } + + revision 2018-10-13 { + description + "Add flow-label setting."; + } + + revision 2017-04-17 { + description + "Removal of br-address leaf-list and br leaf. It adds the + addition of br-address binding_table.softwire. This is to + make the schema more yang-like. One now only need to specify + the br-address on the softwire rather than managing the index's + to a leaf-list of them. + + This also removes the psid-map list and adds a new port-set + container on the softwire container instead. This will help + adding the softwires as well as bring it more inline with the + ietf-softwire schema. + + The addition of /softwire-config/instance allows for configuring + multiple instances of the lwAFTR with a shared binding table and + other common configuration properties."; + } + + revision 2016-11-04 { + description + "Initial revision."; + } + + grouping state-counters { + container softwire-state { + + description "State data about interface."; + config false; + + leaf discontinuity-time { + type yang:date-and-time; + mandatory true; + description + "The time of the most recent occasion on which the lwaftr instance + suffered a discontinuity. This is set to the current time whenever + the lwaftr instance is started or configured."; + } + + leaf drop-all-ipv4-iface-bytes { + type yang:zero-based-counter64; + description + "All dropped packets and bytes that came in over IPv4 interfaces, + whether or not they actually IPv4 (they only include data about + packets that go in/out over the wires, excluding internally generated + ICMP packets)."; + } + leaf drop-all-ipv4-iface-packets { + type yang:zero-based-counter64; + description + "All dropped packets and bytes that came in over IPv4 interfaces, + whether or not they actually IPv4 (they only include data about + packets that go in/out over the wires, excluding internally generated + ICMP packets)."; + } + leaf drop-all-ipv6-iface-bytes { + type yang:zero-based-counter64; + description + "All dropped packets and bytes that came in over IPv6 interfaces, + whether or not they actually IPv6 (they only include data about packets + that go in/out over the wires, excluding internally generated ICMP + packets)."; + } + leaf drop-all-ipv6-iface-packets { + type yang:zero-based-counter64; + description + "All dropped packets and bytes that came in over IPv6 interfaces, + whether or not they actually IPv6 (they only include data about packets + that go in/out over the wires, excluding internally generated ICMP + packets)."; + } + leaf drop-bad-checksum-icmpv4-bytes { + type yang:zero-based-counter64; + description "ICMPv4 packets dropped because of a bad checksum."; + } + leaf drop-bad-checksum-icmpv4-packets { + type yang:zero-based-counter64; + description "ICMPv4 packets dropped because of a bad checksum."; + } + leaf drop-in-by-policy-icmpv4-bytes { + type yang:zero-based-counter64; + description "Incoming ICMPv4 packets dropped because of current policy."; + } + leaf drop-in-by-policy-icmpv4-packets { + type yang:zero-based-counter64; + description "Incoming ICMPv4 packets dropped because of current policy."; + } + leaf drop-in-by-policy-icmpv6-bytes { + type yang:zero-based-counter64; + description "Incoming ICMPv6 packets dropped because of current policy."; + } + leaf drop-in-by-policy-icmpv6-packets { + type yang:zero-based-counter64; + description "Incoming ICMPv6 packets dropped because of current policy."; + } + leaf drop-in-by-rfc7596-icmpv4-bytes { + type yang:zero-based-counter64; + description + "Incoming ICMPv4 packets with no destination (RFC 7596 section 8.1)."; + } + leaf drop-in-by-rfc7596-icmpv4-packets { + type yang:zero-based-counter64; + description + "Incoming ICMPv4 packets with no destination (RFC 7596 section 8.1)."; + } + leaf drop-ipv4-frag-disabled { + type yang:zero-based-counter64; + description + "If fragmentation is disabled, the only potentially non-zero IPv4 + fragmentation counter is drop-ipv4-frag-disabled. If fragmentation is + enabled, it will always be zero."; + } + leaf drop-ipv4-frag-invalid-reassembly { + type yang:zero-based-counter64; + description + "Two or more IPv4 fragments were received, and reassembly was started, + but was invalid and dropped. Causes include multiple fragments claiming + they are the last fragment, overlapping fragment offsets, or the packet + was being reassembled from too many fragments (the setting is + max_fragments_per_reassembly_packet, and the default is that no packet + should be reassembled from more than 40)."; + } + leaf drop-ipv4-frag-random-evicted { + type yang:zero-based-counter64; + description + "Reassembling an IPv4 packet from fragments was in progress, but the + configured amount of packets to reassemble at once was exceeded, so one + was dropped at random. Consider increasing the setting + max_ipv4_reassembly_packets."; + } + leaf drop-ipv6-frag-disabled { + type yang:zero-based-counter64; + description + "If fragmentation is disabled, the only potentially non-zero IPv6 + fragmentation counter is drop-ipv6-frag-disabled. If fragmentation is + enabled, it will always be zero."; + } + leaf drop-ipv6-frag-invalid-reassembly { + type yang:zero-based-counter64; + description + "Two or more IPv6 fragments were received, and reassembly was started, + but was invalid and dropped. Causes include multiple fragments claiming + they are the last fragment, overlapping fragment offsets, or the packet + was being reassembled from too many fragments (the setting is + max_fragments_per_reassembly_packet, and the default is that no packet + should be reassembled from more than 40)."; + } + leaf drop-ipv6-frag-random-evicted { + type yang:zero-based-counter64; + description + "Reassembling an IPv6 packet from fragments was in progress, but the + configured amount of packets to reassemble at once was exceeded, so one + was dropped at random. Consider increasing the setting + max_ipv6_reassembly_packets."; + } + leaf drop-misplaced-not-ipv4-bytes { + type yang:zero-based-counter64; + description "Non-IPv4 packets incoming on the IPv4 link."; + } + leaf drop-misplaced-not-ipv4-packets { + type yang:zero-based-counter64; + description "Non-IPv4 packets incoming on the IPv4 link."; + } + leaf drop-misplaced-not-ipv6-bytes { + type yang:zero-based-counter64; + description "Non-IPv6 packets incoming on IPv6 link."; + } + leaf drop-misplaced-not-ipv6-packets { + type yang:zero-based-counter64; + description "Non-IPv6 packets incoming on IPv6 link."; + } + leaf drop-no-dest-softwire-ipv4-bytes { + type yang:zero-based-counter64; + description + "No matching destination softwire in the binding table; incremented + whether or not the reason was RFC7596."; + } + leaf drop-no-dest-softwire-ipv4-packets { + type yang:zero-based-counter64; + description + "No matching destination softwire in the binding table; incremented + whether or not the reason was RFC7596."; + } + leaf drop-no-source-softwire-ipv6-bytes { + type yang:zero-based-counter64; + description + "No matching source softwire in the binding table; incremented whether + or not the reason was RFC7596."; + } + leaf drop-no-source-softwire-ipv6-packets { + type yang:zero-based-counter64; + description + "No matching source softwire in the binding table; incremented whether + or not the reason was RFC7596."; + } + leaf drop-out-by-policy-icmpv4-packets { + type yang:zero-based-counter64; + description + "Internally generated ICMPv4 error packets dropped because of current + policy."; + } + leaf drop-out-by-policy-icmpv6-packets { + type yang:zero-based-counter64; + description + "Internally generated ICMPv6 packets dropped because of current + policy."; + } + leaf drop-over-mtu-but-dont-fragment-ipv4-bytes { + type yang:zero-based-counter64; + description + "IPv4 packets whose size exceeded the MTU, but the DF (Don't Fragment) + flag was set."; + } + leaf drop-over-mtu-but-dont-fragment-ipv4-packets { + type yang:zero-based-counter64; + description + "IPv4 packets whose size exceeded the MTU, but the DF (Don't Fragment) + flag was set."; + } + leaf drop-over-rate-limit-icmpv6-bytes { + type yang:zero-based-counter64; + description + "Packets dropped because the outgoing ICMPv6 rate limit was reached."; + } + leaf drop-over-rate-limit-icmpv6-packets { + type yang:zero-based-counter64; + description + "Packets dropped because the outgoing ICMPv6 rate limit was reached."; + } + leaf drop-over-time-but-not-hop-limit-icmpv6-bytes { + type yang:zero-based-counter64; + description + "Packet's time limit was exceeded, but the hop limit was not."; + } + leaf drop-over-time-but-not-hop-limit-icmpv6-packets { + type yang:zero-based-counter64; + description + "Packet's time limit was exceeded, but the hop limit was not."; + } + leaf drop-too-big-type-but-not-code-icmpv6-bytes { + type yang:zero-based-counter64; + description + "Packet's ICMP type was 'Packet too big' but its ICMP code was not an + acceptable one for this type."; + } + leaf drop-too-big-type-but-not-code-icmpv6-packets { + type yang:zero-based-counter64; + description + "Packet's ICMP type was 'Packet too big' but its ICMP code was not an + acceptable one for this type."; + } + leaf drop-ttl-zero-ipv4-bytes { + type yang:zero-based-counter64; + description "IPv4 packets dropped because their TTL was zero."; + } + leaf drop-ttl-zero-ipv4-packets { + type yang:zero-based-counter64; + description "IPv4 packets dropped because their TTL was zero."; + } + leaf drop-unknown-protocol-icmpv6-bytes { + type yang:zero-based-counter64; + description "Packets with an unknown ICMPv6 protocol."; + } + leaf drop-unknown-protocol-icmpv6-packets { + type yang:zero-based-counter64; + description "Packets with an unknown ICMPv6 protocol."; + } + leaf drop-unknown-protocol-ipv6-bytes { + type yang:zero-based-counter64; + description "Packets with an unknown IPv6 protocol."; + } + leaf drop-unknown-protocol-ipv6-packets { + type yang:zero-based-counter64; + description "Packets with an unknown IPv6 protocol."; + } + leaf hairpin-ipv4-bytes { + type yang:zero-based-counter64; + description "IPv4 packets going to a known b4 (hairpinned)."; + } + leaf hairpin-ipv4-packets { + type yang:zero-based-counter64; + description "IPv4 packets going to a known b4 (hairpinned)."; + } + leaf in-ipv4-bytes { + type yang:zero-based-counter64; + description "Valid incoming IPv4 bytes."; + } + leaf in-ipv4-frag-needs-reassembly { + type yang:zero-based-counter64; + description "An IPv4 fragment was received."; + } + leaf in-ipv4-frag-reassembled { + type yang:zero-based-counter64; + description "A packet was successfully reassembled from IPv4 fragments."; + } + leaf in-ipv4-frag-reassembly-unneeded { + type yang:zero-based-counter64; + description + "An IPv4 packet which was not a fragment was received - consequently, + it did not need to be reassembled. This should be the usual case."; + } + leaf in-ipv4-packets { + type yang:zero-based-counter64; + description "Validalid incoming IPv4 packets."; + } + leaf in-ipv6-bytes { + type yang:zero-based-counter64; + description "Valid incoming IPv6 bytes."; + } + leaf in-ipv6-frag-needs-reassembly { + type yang:zero-based-counter64; + description "An IPv6 fragment was received."; + } + leaf in-ipv6-frag-reassembled { + type yang:zero-based-counter64; + description "A packet was successfully reassembled from IPv6 fragments."; + } + leaf in-ipv6-frag-reassembly-unneeded { + type yang:zero-based-counter64; + description + "An IPv6 packet which was not a fragment was received - consequently, it + did not need to be reassembled. This should be the usual case."; + } + leaf in-ipv6-packets { + type yang:zero-based-counter64; + description "Valid incoming IPv6 packets."; + } + leaf ingress-packet-drops { + type yang:zero-based-counter64; + description "Packets dropped due to ingress filters."; + } + leaf memuse-ipv4-frag-reassembly-buffer { + type yang:zero-based-counter64; + description + "The amount of memory being used by the statically sized data structure + for reassembling IPv4 fragments. This is directly proportional to the + setting max_ipv4_reassembly_packets."; + } + leaf memuse-ipv6-frag-reassembly-buffer { + type yang:zero-based-counter64; + description + "The amount of memory being used by the statically sized data structure + for reassembling IPv6 fragments. This is directly proportional to the + setting max_ipv6_reassembly_packets."; + } + leaf in-arp-request-bytes { + type yang:zero-based-counter64; + description "Incoming ARP request bytes."; + } + leaf in-arp-request-packets { + type yang:zero-based-counter64; + description "Incoming ARP request packets."; + } + leaf out-arp-request-bytes { + type yang:zero-based-counter64; + description "Internally generated ARP request bytes."; + } + leaf out-arp-request-packets { + type yang:zero-based-counter64; + description "Internally generated ARP request packets."; + } + leaf in-arp-reply-bytes { + type yang:zero-based-counter64; + description "Incoming ARP reply bytes."; + } + leaf in-arp-reply-packets { + type yang:zero-based-counter64; + description "Incoming ARP reply packets."; + } + leaf out-arp-reply-bytes { + type yang:zero-based-counter64; + description "Internally generated ARP reply bytes."; + } + leaf out-arp-reply-packets { + type yang:zero-based-counter64; + description "Internally generated ARP reply packets."; + } + leaf in-ndp-ns-bytes { + type yang:zero-based-counter64; + description "Incoming NDP neighbor solicitation bytes."; + } + leaf in-ndp-ns-packets { + type yang:zero-based-counter64; + description "Incoming NDP neighbor solicitation packets."; + } + leaf out-ndp-ns-bytes { + type yang:zero-based-counter64; + description "Internally generated NDP neighbor solicitation bytes."; + } + leaf out-ndp-ns-packets { + type yang:zero-based-counter64; + description "Internally generated NDP neighbor solicitation packets."; + } + leaf in-ndp-na-bytes { + type yang:zero-based-counter64; + description "Incoming NDP neighbot advertisement bytes."; + } + leaf in-ndp-na-packets { + type yang:zero-based-counter64; + description "Incoming NDP neighbot advertisement packets."; + } + leaf out-ndp-na-bytes { + type yang:zero-based-counter64; + description "Internally generated NDP neighbot advertisement bytes."; + } + leaf out-ndp-na-packets { + type yang:zero-based-counter64; + description "Internally generated NDP neighbot advertisement packets."; + } + leaf out-icmpv4-error-bytes { + type yang:zero-based-counter64; + description "Internally generated ICMPv4 error bytes."; + } + leaf out-icmpv4-error-packets { + type yang:zero-based-counter64; + description "Internally generated ICMPv4 error packets."; + } + leaf out-icmpv6-error-bytes { + type yang:zero-based-counter64; + description "Internally generted ICMPv6 error bytes."; + } + leaf out-icmpv6-error-packets { + type yang:zero-based-counter64; + description "Internally generted ICMPv6 error packets."; + } + leaf in-icmpv4-echo-bytes { + type yang:zero-based-counter64; + description "Valid incoming ICMPv4 echo request bytes."; + } + leaf in-icmpv4-echo-packets { + type yang:zero-based-counter64; + description "Valid incoming ICMPv4 echo request packets."; + } + leaf out-icmpv4-echo-bytes { + type yang:zero-based-counter64; + description "Internally generated ICMPv4 echo reply bytes."; + } + leaf out-icmpv4-echo-packets { + type yang:zero-based-counter64; + description "Internally generated ICMPv4 echo reply packets."; + } + leaf in-icmpv6-echo-bytes { + type yang:zero-based-counter64; + description "Valid incoming ICMPv6 echo request bytes."; + } + leaf in-icmpv6-echo-packets { + type yang:zero-based-counter64; + description "Valid incoming ICMPv6 echo request packets."; + } + leaf out-icmpv6-echo-bytes { + type yang:zero-based-counter64; + description "Internally generted ICMPv6 echo reply bytes."; + } + leaf out-icmpv6-echo-packets { + type yang:zero-based-counter64; + description "Internally generted ICMPv6 echo reply packets."; + } + leaf out-ipv4-bytes { + type yang:zero-based-counter64; + description "Valid outgoing IPv4 bytes."; + } + leaf out-ipv4-frag { + type yang:zero-based-counter64; + description + "An outgoing packet exceeded the configured IPv4 MTU, so needed to be + fragmented. This may happen, but should be unusual."; + } + leaf out-ipv4-frag-not { + type yang:zero-based-counter64; + description + "An outgoing packet was small enough to pass through unfragmented - this + should be the usual case."; + } + leaf out-ipv4-packets { + type yang:zero-based-counter64; + description "Valid outgoing IPv4 packets."; + } + leaf out-ipv6-bytes { + type yang:zero-based-counter64; + description "Valid outgoing IPv6 bytes."; + } + leaf out-ipv6-frag { + type yang:zero-based-counter64; + description + "An outgoing packet exceeded the configured IPv6 MTU, so needed to be + fragmented. This may happen, but should be unusual."; + } + leaf out-ipv6-frag-not { + type yang:zero-based-counter64; + description + "An outgoing packet was small enough to pass through unfragmented - this + should be the usual case."; + } + leaf out-ipv6-packets { + type yang:zero-based-counter64; + description "Valid outgoing IPv6 packets."; + } + } + } + + container softwire-config { + description + "Configuration for Snabb lwaftr."; + + leaf name { + type string; + description + "Name of lwAFTR instance. This must be unique amongst the Snabb + processes on the system. This may be specified either here, in the + YANG configuration or via the command line when the lwAFTR is started. + + The order of presidence for this leaf is as followers: + 1. The name set on an already running lwAFTR instance via snabb set. + 2. A command line option to specify the name upon starting the lwAFTR + instance (i.e. overriding this value). + 3. The value here in the configuration when starting a lwaftr instance. + + If no name is specified the lwaftr can be referred to using the PID of + the lwAFTR process on the system."; + } + + grouping traffic-filters { + description + "Ingress and egress filters describing the set of packets + that should be allowed to pass, as pflang filters. pflang + is the language of tcpdump, libpcap and other tools. Note + that if VLAN tagging is enabled, the filters run on packets + after VLAN tags have been stripped off."; + leaf ingress-filter { + type string; + description + "Filter for incoming traffic. Packets that do not match + the filter will be silently dropped."; + } + leaf egress-filter { + type string; + description + "Filter for outgoing traffic. Packets that do not match + the filter will be silently dropped."; + } + } + + grouping icmp-policy { + description + "The lwAFTR can be configured to allow or drop incoming ICMP + messages, and to generate outgoing ICMP error messages or + not."; + + leaf allow-incoming-icmp { + type boolean; + default true; + description + "Whether to allow incoming ICMP packets."; + } + + leaf generate-icmp-errors { + type boolean; + default true; + description + "Whether to generate outgoing ICMP error messages."; + } + } + + grouping vlan-tagging { + description + "802.1Q Ethernet tagging."; + + leaf vlan-tag { + type uint16 { + range 0..4095; + } + description + "802.1Q Ethernet VLAN tag for this interface."; + } + } + + grouping error-rate-limiting { + description + "These settings limit the rate of ICMP error message + transmission."; + + container error-rate-limiting { + leaf packets { + type uint32; + default 200; + description + "The number of ICMP error messages which can be sent within + the specified time period."; + } + + leaf period { + type uint32 { range 1..max; } + default 2; + description + "The time period given in seconds."; + } + } + } + + grouping reassembly { + description + "These settings limit the resources devoted to reassembling + fragmented packets."; + + container reassembly { + leaf max-fragments-per-packet { + type uint32 { range 1..max; } + default 20; + description + "The maximum number of fragments per reassembled packet. + Attempts to reassemble a packet using more fragments than + this threshold will fail and the reassembly data will be + discarded."; + } + + leaf max-packets { + type uint32; + default 20000; + description + "The maximum number of concurrent reassembly attempts. If + this limit is reached, an additional reassembly will cause + random eviction of an ongoing reassembly. Note that this + setting directly affects memory usage; the memory buffer + allocated to reassembly is this maximum number of + reassemblies times 25 kilobytes each."; + } + } + } + + + list instance { + description + "Provides configuration for specific instances of the lwAFTR. + These configuration options will only affect the specific lwaftr + with the given name specified in the name leaf. The other options + not present in this list are shared amongst all instances."; + + key "device"; + + leaf device { + type string; + description + "The PCI device the instance should use during lwAFTR operation. If + device is configured in on-a-stick mode, 'external-device' + should not be configured. If 'external-device' is + specified this option should specify the PCI device of the + 'internal-interface' (IPv6 traffic only)."; + } + + leaf external-device { + type string; + description + "PCI device the instance should use for the 'external-interface' + (IPv4 traffic only). If this is left unspecified the lwAFTR + configures itself in on-a-stick mode."; + } + + list queue { + description "List of Receive-Side Scaling (RSS) queues."; + key "id"; + + leaf id { + type uint8; + description + "RSS queue on which to attach. Traffic will be partitioned + evenly between instances servicing queues on the same + interface. The queue to which an incoming packet is assigned + is a function of the TCP or UDP source and destination ports + (if any) and the source and destination IPv4 or IPv6 + addresses. Fragmented packets will be delivered to the + lowest-numbered queue."; + } + + container external-interface { + leaf ip { + type inet:ipv4-address; + mandatory true; + description + "L3 Address of the internet-facing network interface. Used + when generating error messages and responding to ICMP echo + requests."; + } + leaf mac { + type yang:mac-address; + mandatory true; + description + "MAC address of the internet-facing NIC."; + } + + uses vlan-tagging; + + container next-hop { + choice address { + mandatory true; + case ip { + leaf ip { + type inet:ipv4-address; + description + "IPv4 address of the next hop for the internet-facing NIC. + The lwAFTR will resolve this to a MAC address using ARP."; + } + leaf resolved-mac { + config false; + description "Resolved next-hop mac address found by ARP."; + type yang:mac-address; + } + } + case mac { + leaf mac { + type yang:mac-address; + description + "Statically configured MAC address of the next hop for the + internet-facing NIC."; + } + } + } + } + } + + container internal-interface { + leaf ip { + type inet:ipv6-address; + mandatory true; + description + "L3 Address of the internal-facing network interface. Used + when generating error messages and responding to ICMP echo + requests."; + } + leaf mac { + type yang:mac-address; + mandatory true; + description + "MAC address of the internal-facing NIC."; + } + + uses vlan-tagging; + + + container next-hop { + choice address { + mandatory true; + case ip { + leaf ip { + type inet:ipv6-address; + description + "IPv6 address of the next hop for the internal-facing NIC. + The lwAFTR will resolve this to a MAC address using NDP."; + } + leaf resolved-mac { + config false; + description "Resolved next-hop mac address found by NDP."; + type yang:mac-address; + } + } + case mac { + leaf mac { + type yang:mac-address; + description + "Statically configured MAC address of the next hop for the + internal-facing NIC."; + } + } + } + } + } + } + + uses state-counters; + } + + container external-interface { + description + "Configuration for the external, internet-facing IPv4 + interface."; + + leaf mtu { + type uint16; + default 1460; + description + "Maximum packet size to send on the IPv4 interface."; + } + + leaf mru { + type uint16; + default 1460; + description + "Maximum packet size to receive on the IPv4 interface."; + } + + uses traffic-filters; + uses icmp-policy; + uses error-rate-limiting; + uses reassembly; + + + } + + container internal-interface { + description + "Configuration for the internal IPv6 interface."; + + leaf mtu { + type uint16; + default 1500; + description + "Maximum packet size to sent on the IPv6 interface."; + } + + leaf mru { + type uint16; + default 1460; + description + "Maximum packet size to recieve on the IPv6 interface."; + } + + leaf flow-label { + type uint32; + default 0; + description + "IPv6 flow label"; + } + + uses traffic-filters; + uses icmp-policy; + uses vlan-tagging; + uses error-rate-limiting; + uses reassembly; + + leaf hairpinning { + type boolean; + default true; + description + "Indicates whether to support hairpinning of traffic between + two B4s."; + } + } + + container binding-table { + description + "A collection of softwires (tunnels), along with a description + of the IPv4 and IPv6 addresses handled by the lwAFTR."; + + list softwire { + key "ipv4 psid"; + + leaf ipv4 { + type inet:ipv4-address; + mandatory true; + description + "Public IPv4 address of the softwire."; + } + + leaf padding { + type uint16; + default 0; + } + + leaf br-address { + type inet:ipv6-address; + mandatory true; + description + "The B4-facing address of the lwAFTR for this softwire."; + } + + leaf b4-ipv6 { + type inet:ipv6-address; + mandatory true; + description + "B4 address."; + } + + leaf psid { + type uint16; + mandatory true; + description "Port set ID."; + } + + container port-set { + description + "The set of IPv4 addresses managed by the lwAFTR, along with + the way in which those IPv4 addresses share ports. A PSID map + entry associates a PSID length and reserved-ports-bit-count + with each IPv4 address served by the lwAFTR. + + The lightweight 4-over-6 architecture supports sharing of + IPv4 addresses by partitioning the space of TCP/UDP/ICMP + ports into disjoint \"port sets\". Each softwire associated + with an IPv4 address corresponds to a different set of ports + on that address. The way that the ports are partitioned is + specified in RFC 7597: each address has an associated set + of parameters that specifies how to compute a \"port set + identifier\" (PSID) from a given port. + + 0 1 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 + +-----------+-----------+-------+ + Ports in | A | PSID | j | + the CE port set | > 0 | | | + +-----------+-----------+-------+ + | a bits | k bits |m bits | + + Figure 2: Structure of a Port-Restricted Port Field + + Source: http://tools.ietf.org/html/rfc7597#section-5.1 + + We find the specification's names to be a bit obtuse, so we + refer to them using the following names: + + a bits = reserved-ports-bit-count. + k bits = psid-length. + m bits = shift. + + The shift parameter is calculated from psid-length and + reserved-ports-bit-count. The calculation performed to + get the value of shift is: + + shift = 16 - psid-length - reserved-ports-bit-count"; + + leaf psid-length { + type uint8 { range 0..16; } + mandatory true; + description + "The number of bits devoted to the PSID in the port map. + If the psid-length is N, then the IPv4 address will be + shared 2^N ways. Note that psid-length, shift, and + reserved-ports-bit-count must add up to 16."; + } + + leaf reserved-ports-bit-count { + type uint8 { range 0..16; } + default 0; + description + "Reserve the lowest 2^N ports so that they map to no + softwire. This can be useful to prevent the low 1024 + ports (for example) from being mapped to customers. Note + that psid-length and shift must add up to less than or + equal to 16."; + } + } + } + + container version { + description + "Optional versioning for binding table. The vesioning information + will change on every update or change to the binding table."; + + leaf number { + type uint64; + description "Incremental version number."; + } + leaf date { + type yang:date-and-time; + description "Timestamp of last change."; + } + } + } + } + + uses state-counters; +} diff --git a/src/program/alarms/set_operator_state/README b/src/program/alarms/set_operator_state/README index f88e83eb13..b890c0a1f1 100644 --- a/src/program/alarms/set_operator_state/README +++ b/src/program/alarms/set_operator_state/README @@ -21,7 +21,7 @@ An OPERATOR-STATE can take the following values: 'none', 'ack', 'closed', Typical usage: -$ snabb alarms set-operator-state --schema snabb-softwire-v2 lwaftr resource arp-resolution ack +$ snabb alarms set-operator-state --schema snabb-softwire-v3 lwaftr resource arp-resolution ack See https://github.com/Igalia/snabb/blob/lwaftr/src/program/alarms/README.md for full documentation. diff --git a/src/program/config/common.lua b/src/program/config/common.lua index 067608adf8..689c7d6359 100644 --- a/src/program/config/common.lua +++ b/src/program/config/common.lua @@ -10,6 +10,7 @@ local file = require("lib.stream.file") local rpc = require("lib.yang.rpc") local yang = require("lib.yang.yang") local data = require("lib.yang.data") +local path_data = require("lib.yang.path_data") local path_resolver = require("lib.yang.path_data").resolver function show_usage(command, status, err_msg) @@ -38,7 +39,13 @@ end function data_parser(schema_name, path, is_config) local grammar = path_grammar(schema_name, path, is_config) - return data.data_parser_from_grammar(grammar) + local parser = data.data_parser_from_grammar(grammar) + local validator = path_data.consistency_checker_from_grammar(grammar) + return function (data) + local config = parser(data) + validator(config) + return config + end end function config_parser(schema_name, path) diff --git a/src/program/lwaftr/compile_configuration/README b/src/program/lwaftr/compile_configuration/README index 7f2e11bd0a..941da374eb 100644 --- a/src/program/lwaftr/compile_configuration/README +++ b/src/program/lwaftr/compile_configuration/README @@ -6,7 +6,7 @@ Usage: compile-configuration LWAFTR.CONF [LWAFTR.O] Validate and compile a configuration file. A lwAFTR configuration file follows the schema described in -`lib/yang/snabb-softwire-v2.yang`. It consists of several containers +`lib/yang/snabb-softwire-v3.yang`. It consists of several containers such as `binding-table`, `external-interface` and `internal-interface` and `instance`. diff --git a/src/program/lwaftr/compile_configuration/compile_configuration.lua b/src/program/lwaftr/compile_configuration/compile_configuration.lua index 04fe1e1259..62f3e37bfc 100644 --- a/src/program/lwaftr/compile_configuration/compile_configuration.lua +++ b/src/program/lwaftr/compile_configuration/compile_configuration.lua @@ -19,7 +19,7 @@ end function run(args) local filein, fileout = parse_args(args) local success, err = pcall(yang.load_configuration, filein, - {schema_name='snabb-softwire-v2', compiled_filename=fileout}) + {schema_name='snabb-softwire-v3', compiled_filename=fileout}) if not success then print(tostring(err)) main.exit(1) diff --git a/src/program/lwaftr/counters.lua b/src/program/lwaftr/counters.lua index ac04964023..1dd6f01d4d 100644 --- a/src/program/lwaftr/counters.lua +++ b/src/program/lwaftr/counters.lua @@ -8,7 +8,7 @@ local S = require('syscall') function counter_names () local names = {} - local schema = schema.load_schema_by_name('snabb-softwire-v2') + local schema = schema.load_schema_by_name('snabb-softwire-v3') for k, node in pairs(schema.body['softwire-state'].body) do if node.kind == 'leaf' then names[k] = data.normalize_id(k) @@ -18,7 +18,7 @@ function counter_names () end function read_counters (pid) - local reader = state.state_reader_from_schema_by_name('snabb-softwire-v2') + local reader = state.state_reader_from_schema_by_name('snabb-softwire-v3') local s = reader(state.counters_for_pid(pid or S.getpid())) local ret = {} for k, id in pairs(counter_names()) do diff --git a/src/program/lwaftr/doc/CHANGELOG.md b/src/program/lwaftr/doc/CHANGELOG.md index 6635173ce4..8d4c66cfd6 100644 --- a/src/program/lwaftr/doc/CHANGELOG.md +++ b/src/program/lwaftr/doc/CHANGELOG.md @@ -1,5 +1,71 @@ # Change Log +## [2022.01.13] + +### Notable changes + + * Support for XDP, AVF, and Mellanox drivers + + * Restore support for bump-in-the-wire operation + + * New updated lwAFTR YANG schema: `snabb-softwire-v3.yang`. + lwAFTR can now operate on >2 CPU cores + + * Add statistics counters for ICMP, ARP, and NDP + + * Fragmenter/defragmenter can now handle padded packets (bug fix) + + * NDP app now sends correct neighbot advertisements (bug fix) + + * Fix a parsing bug in `lib.yang` where nested default values of leaves where not set + + * Fix a bug in `lib.numa` where it could not gracefully handle the inability to read a CPU performance governor + +## [2019.06.02] + +### Notable changes + + * Fix `snabb top` to correctly display per-worker statistics for + instances of the lwAFTR running with receive-side scaling (RSS). + See https://github.com/Igalia/snabb/pull/1237. + + * Fix a problem related to an interaction between late trace + compilation and the ingress drop monitor. + + For context, Snabb uses LuaJIT, which is a just-in-time compiler. + LuaJIT compiles program segments called traces. Traces can jump to + each other, and thereby form a graph. The shape of the trace graph + can have important performance impacts on a network function, but + building the optimal graph shape is fundamentally hard. Usually + LuaJIT does a good job, but if a network function is dropping + packets, Snabb's "ingress drop monitor" will ask LuaJIT to re-learn + the graph of traces, in the hopes that this self-healing process will + fix the packet loss situation. + + Unfortunately, the self-healing process has some poor interactions + with so-called "long tail" traces -- traces that aren't taking an + important amount of time, but which LuaJIT might decide to compile a + few seconds into the running of a network function. Compiling a + trace can cause a latency spike and dropped packets, so the work of + compiling these long-tail traces can in fact be interpreted as a + packet loss situation, thereby triggering the self-healing process, + leading to a pathologically repeating large packet loss situation. + + The right answer is for LuaJIT to avoid the latency cost for + long-tail trace compilation. While this might make long-tail traces + run not as fast as they would if they were compiled, these traces + take so little time anyway that it doesn't matter enough to pay the + cost of trace compilation. + + See https://github.com/Igalia/snabb/pull/1236 and + https://github.com/Igalia/snabb/pull/1239 for full details. + + * Disable profiling by default. The version of LuaJIT that Snabb uses + includes a facility for online profiling of network functions. This + facility is low-overhead but not no-overhead. We have disabled it by + default on the lwAFTR; it can be enabled by passing the --profile + option. See https://github.com/Igalia/snabb/pull/1238. + ## [2019.06.01] ### Notable changes diff --git a/src/program/lwaftr/doc/configuration.md b/src/program/lwaftr/doc/configuration.md index 4a01d68a30..b929f25a26 100644 --- a/src/program/lwaftr/doc/configuration.md +++ b/src/program/lwaftr/doc/configuration.md @@ -2,7 +2,7 @@ The lwAFTR's configuration is modelled by a [YANG](https://tools.ietf.org/html/rfc6020) schema, -[snabb-softwire-v2](../../../lib/yang/snabb-softwire-v2.yang). +[snabb-softwire-v3](../../../lib/yang/snabb-softwire-v3.yang). The lwAFTR takes its configuration from the user in the form of a text file. That file's grammar is derived from the YANG schema; see the @@ -120,11 +120,10 @@ softwire-config { The lwaftr will spawn a number of worker processes that perform packet forwarding. Each `queue` statement in the configuration corresponds to one process servicing one RSS queue on one or two network devices. For -on-a-stick operation, only the `device` leaf that is part of the -`instance` leaf will be specified. For bump-in-the-wire operation, the -`instance` device will handle IPv6 traffic, and the `device` specified -in the `external-interface` that's part of the `queue` will handle IPv4 -traffic. +on-a-stick operation, only the `device` leaf will be specified. +For bump-in-the-wire operation, `device` will handle IPv6 traffic, and +IPv4 traffic will be handled on the device specified in the +`external-device` leaf. The `external-interface` define parameters around the IPv4 interface that communicates with the internet and the `internal-interface` section @@ -151,7 +150,7 @@ the given *PID* to reload its configuration from the given file. ## In-depth configuration explanation See the embedded descriptions in the -[snabb-softwire-v2](../../../lib/yang/snabb-softwire-v2.yang) schema +[snabb-softwire-v3](../../../lib/yang/snabb-softwire-v3.yang) schema file. ## Binding tables @@ -309,10 +308,10 @@ example, here's a bump-in-the-wire configuration with two RSS workers: ``` instance { device 83:00.0; + external-device 83:00.1; queue { id 0; external-interface { - device 83:00.1; ip 10.10.10.10; mac 56:56:56:56:56:56; next-hop { mac 02:68:68:68:68:68; } @@ -326,7 +325,6 @@ example, here's a bump-in-the-wire configuration with two RSS workers: queue { id 1; external-interface { - device 83:00.1; ip 10.10.10.10; mac 56:56:56:56:56:56; next-hop { mac 02:68:68:68:68:68; } @@ -341,8 +339,7 @@ example, here's a bump-in-the-wire configuration with two RSS workers: ``` These queues are configured on the `83:00.0` instance, and because the -queues have a different device configured on the `external-interface` -containers, that makes this configuration a bump-in-the-wire +instance specifies an `external-device` this is a bump-in-the-wire configuration. The two queues are identical with the exception of their `id` fields. Incoming IPv6 traffic on `83:00.0` and IPv4 traffic on `83:00.1` will be evenly split between these two worker processes using @@ -384,7 +381,7 @@ lwAFTR is addressable using the [`ietf-softwire-br`](../../../lib/yang/ietf-softwire-br.yang) YANG schema. The lwAFTR also has a "native" schema that exposes more configuration information, -[`snabb-softwire-v2`](../../../lib/yang/snabb-softwire-v2.yang). Pass +[`snabb-softwire-v3`](../../../lib/yang/snabb-softwire-v3.yang). Pass the `-s` argument to the `snabb config` tools to specify a non-default YANG schema. @@ -393,7 +390,7 @@ next-hop address of the external interface on lwaftr instance `lwaftr`'s queue `0` on device `83:00.0`: ``` -$ snabb config set -s snabb-softwire-v2 lwaftr \ +$ snabb config set -s snabb-softwire-v3 lwaftr \ /softwire-config/instance[device=83:00.0]/queue[id=0]/external-interface/next-hop/mac \ 02:02:02:02:02:02 ``` @@ -402,7 +399,7 @@ $ snabb config set -s snabb-softwire-v2 lwaftr \ Firstly, we suggest getting a lwAFTR configuration working that runs on only one interface and one queue. Once you have that working, do a -`snabb config get -s snabb-softwire-v2 lwaftr /softwire-config/instance` +`snabb config get -s snabb-softwire-v3 lwaftr /softwire-config/instance` to get the `instance` configuration for the `lwaftr` instance. You'll get something like this: @@ -421,7 +418,7 @@ So to add another device, you can just paste that into a file, change the devices, and then do: ``` -$ snabb config add -s snabb-softwire-v2 lwaftr \ +$ snabb config add -s snabb-softwire-v3 lwaftr \ /softwire-config/instance < my-instance.file.conf ``` @@ -442,14 +439,14 @@ like you think they should be. To remove a queue, use `snabb config remove`: ``` -$ snabb config remove -s snabb-softwire-v2 lwaftr \ +$ snabb config remove -s snabb-softwire-v3 lwaftr \ /softwire-config/instance[device=XX:XX.X]/queue[id=ID] ``` Likewise you can remove instances this way: ``` -$ snabb config remove -s snabb-softwire-v2 lwaftr \ +$ snabb config remove -s snabb-softwire-v3 lwaftr \ /softwire-config/instance[device=XX:XX.X] ``` diff --git a/src/program/lwaftr/generate_configuration/README b/src/program/lwaftr/generate_configuration/README index f5020e88a2..8be6bcc328 100644 --- a/src/program/lwaftr/generate_configuration/README +++ b/src/program/lwaftr/generate_configuration/README @@ -5,7 +5,7 @@ snabb lwaftr generate-configuration Output filename (snabb-softwire-v2 configuration file). + --output Output filename (snabb-softwire-v3 configuration file). Examples: diff --git a/src/program/lwaftr/migrate_configuration/migrate_configuration.lua b/src/program/lwaftr/migrate_configuration/migrate_configuration.lua index af33e23f5a..a960c936c8 100644 --- a/src/program/lwaftr/migrate_configuration/migrate_configuration.lua +++ b/src/program/lwaftr/migrate_configuration/migrate_configuration.lua @@ -12,6 +12,7 @@ local yang = require('lib.yang.yang') local binding_table = require("apps.lwaftr.binding_table") local Parser = require("program.lwaftr.migrate_configuration.conf_parser").Parser local data = require('lib.yang.data') +local schema = require('lib.yang.schema') local br_address_t = ffi.typeof('uint8_t[16]') local SOFTWIRE_TABLE_LOAD_FACTOR = 0.4 @@ -436,13 +437,38 @@ local function remove_psid_map(conf) return conf end +local function v3_migration(src, conf_file) + local v2_schema = yang.load_schema_by_name("snabb-softwire-v2") + local v3_schema = yang.load_schema_by_name("snabb-softwire-v3") + local conf = yang.load_config_for_schema( + v2_schema, mem.open_input_string(src, conf_file)) + + -- Move leaf external-interface/device up as external-device. + for device, instance in pairs(conf.softwire_config.instance) do + for id, queue in pairs(instance.queue) do + if queue.external_interface.device then + if instance.external_device then + io.stderr:write('Multiple external devices detected; '.. + 'manual verification needed.\n') + io.stderr:flush() + end + instance.external_device = queue.external_interface.device + queue.external_interface.device = nil + end + end + end + + return config_to_string(v3_schema, conf) +end + local function multiprocess_migration(src, conf_file) local device = "IPv6 PCI Address" local ex_device = "IPv4 PCI address" -- We should build up a hybrid schema from parts of v1 and v2. local v1_schema = yang.load_schema_by_name("snabb-softwire-v1") - local hybridscm = yang.load_schema_by_name("snabb-softwire-v2") + -- Make sure we load a fresh schema, as not to mutate a memoized copy + local hybridscm = schema.load_schema(schema.load_schema_source_by_name("snabb-softwire-v2")) local v1_external = v1_schema.body["softwire-config"].body["external-interface"] local v1_internal = v1_schema.body["softwire-config"].body["internal-interface"] local external = hybridscm.body["softwire-config"].body["external-interface"] @@ -473,9 +499,9 @@ local function multiprocess_migration(src, conf_file) -- Build up the instance list local instance = { - [device] = {queue = cltable.new({ key_type = queue_key }),}, + [device] = {queue={}}, } - local key = ffi.new(queue_key, 0) + local key = 0 local value = { external_interface = { device = ex_device, @@ -508,7 +534,7 @@ local function multiprocess_migration(src, conf_file) else error("One or both of next-hop values must be provided.") end - cltable.set(instance[device].queue, key, value) + instance[device].queue[key] = value conf.softwire_config.instance = instance -- Remove the fields which no longer should exist @@ -521,7 +547,7 @@ local function multiprocess_migration(src, conf_file) conf.softwire_config.external_interface.next_hop = nil conf.softwire_config.external_interface.vlan_tag = nil - return config_to_string('snabb-softwire-v2', conf) + return config_to_string(hybridscm, conf) end local function v2_migration(src, conf_file) @@ -529,7 +555,9 @@ local function v2_migration(src, conf_file) -- switch over to v2 of snabb-softwire config. local v1_schema = yang.load_schema_by_name("snabb-softwire-v1") local v1_binding_table = v1_schema.body["softwire-config"].body["binding-table"] - local hybridscm = yang.load_schema_by_name("snabb-softwire-v2") + + -- Make sure we load a fresh schema, as not to mutate a memoized copy + local hybridscm = schema.load_schema(schema.load_schema_source_by_name("snabb-softwire-v2")) local binding_table = hybridscm.body["softwire-config"].body["binding-table"] -- Add the schema from v1 that we need to convert them. @@ -547,6 +575,9 @@ local function v2_migration(src, conf_file) -- Remove the mandatory requirement on softwire.br-address for the migration binding_table.body["softwire"].body["br-address"].mandatory = false + -- Remove the mandatory requirement on softwire.port-set.psid-length for the migration + binding_table.body["softwire"].body["port-set"].body["psid-length"].mandatory = false + local conf = yang.load_config_for_schema( hybridscm, mem.open_input_string(src, conf_file)) @@ -590,13 +621,18 @@ local function migrate_2017_07_01(conf_file, src) return multiprocess_migration(src, conf_file) end +local function migrate_2022_01_19(conf_file, src) + return v3_migration(src, conf_file) +end + local migrations = { {version='legacy', migrator=migrate_legacy}, {version='3.0.1', migrator=migrate_3_0_1}, {version='3.0.1.1', migrator=migrate_3_0_1bis}, {version='3.2.0', migrator=migrate_3_2_0}, - {version='2017.07.01',migrator=migrate_2017_07_01} + {version='2017.07.01',migrator=migrate_2017_07_01}, + {version='2022.01.19',migrator=migrate_2022_01_19}, } @@ -617,6 +653,7 @@ function run(args) local conf = io.open(conf_file, "r"):read("*a") for _, migration in next,migrations,start do + io.stderr:write(("-> %s migration\n"):format(migration.version)) conf = migration.migrator(conf_file, conf) -- Prompt the garbage collection to do a full collect after each migration collectgarbage() diff --git a/src/program/lwaftr/run/README b/src/program/lwaftr/run/README index 62d0eef32e..7396df7db9 100644 --- a/src/program/lwaftr/run/README +++ b/src/program/lwaftr/run/README @@ -3,7 +3,7 @@ Usage: run --help Required arguments: -c CONF, --conf CONF Use configuration from the file CONF. - See the snabb-softwire-v2 YANG module + See the snabb-softwire-v3 YANG module for full documentation. Optional arguments: @@ -29,6 +29,8 @@ Optional arguments: -i, --virtio Interpret PCI addresses as referring to virtio-net interfaces instead of auto-detecting the appropriate driver. + --xdp Use Linux interfaces via XDP. + (Incompatible with --on-a-stick.) -r SIZE, --ring-buffer-size SIZE Set NIC receive buffer size. The default is driver-dependent. See @@ -49,11 +51,10 @@ Optional arguments: Optional arguments for debugging and profiling: -v Verbose (repeat for more verbosity). + --profile Enable the low-overhead sampling + profiler. -t FILE, --trace FILE Record a trace of any run-time "snabb config" commands to FILE. - -jv, -jv=FILE Print out when traces are recorded. - -jp, -jp=MODE,FILE Profile the system by method. - -jtprof Profile the system by trace. -b FILENAME, --bench-file FILENAME Write any benchmarking data to FILENAME. -D SECONDS Stop after SECONDS, for debugging diff --git a/src/program/lwaftr/run/run.lua b/src/program/lwaftr/run/run.lua index 209d724d2b..e571448859 100644 --- a/src/program/lwaftr/run/run.lua +++ b/src/program/lwaftr/run/run.lua @@ -39,9 +39,7 @@ local function migrate_device_on_config(config, v4, v6) end if v6 then - for id, queue in pairs(instance.queue) do - queue.external_interface.device = v6 - end + instance.external_device = v6 end end @@ -50,12 +48,16 @@ function parse_args(args) local conf_file, v4, v6 local ring_buffer_size local opts = { verbosity = 0 } - local scheduling = { ingress_drop_monitor = 'flush' } + local scheduling = { ingress_drop_monitor = 'flush', profile = false } local handlers = {} function handlers.n (arg) opts.name = assert(arg) end function handlers.v () opts.verbosity = opts.verbosity + 1 end function handlers.t (arg) opts.trace = assert(arg) end function handlers.i () opts.virtio_net = true end + handlers['xdp'] = function(arg) + opts['xdp'] = true + scheduling.enable_xdp = {} -- XXX - maybe configure num_chunks here? + end function handlers.D (arg) opts.duration = assert(tonumber(arg), "duration must be a number") assert(opts.duration >= 0, "duration can't be negative") @@ -100,13 +102,14 @@ function parse_args(args) .." (valid values: flush, warn, off)") end end - function handlers.j(arg) scheduling.j = arg end + function handlers.profile() scheduling.profile = true end function handlers.h() show_usage(0) end - lib.dogetopt(args, handlers, "b:c:vD:yhir:n:j:t:", + lib.dogetopt(args, handlers, "b:c:vD:yhir:n:t:", { conf = "c", name = "n", cpu = 1, v4 = 1, v6 = 1, ["on-a-stick"] = 1, virtio = "i", ["ring-buffer-size"] = "r", + ["xdp"] = 0, ["real-time"] = 0, mirror = 1, ["ingress-drop-monitor"] = 1, - verbose = "v", trace = "t", ["bench-file"] = "b", + verbose = "v", trace = "t", ["bench-file"] = "b", ["profile"] = 0, duration = "D", hydra = "y", help = "h" }) if ring_buffer_size ~= nil then if opts.virtio_net then @@ -147,17 +150,29 @@ function run(args) -- anything defined in the config. if opts.name then conf.softwire_config.name = opts.name end + -- If we’re using XDP, setup interfaces here + if opts.xdp then + setup.xdp_ifsetup(conf) + end + local function setup_fn(graph, lwconfig) -- If --virtio has been specified, always use this. if opts.virtio_net then return setup_fn(graph, lwconfig, 'inetNic', 'b4sideNic') end - -- If instance has external-interface.device configure as bump-in-the-wire + -- If --xdp has been specified, always use this. + if opts.xdp then + return setup.load_xdp(graph, lwconfig, 'inetNic', 'b4sideNic', + opts.ring_buffer_size) + end + + -- If instance has external-device configure as bump-in-the-wire -- otherwise configure it in on-a-stick mode. - local device, id, queue = lwutil.parse_instance(lwconfig) - if not lwutil.is_on_a_stick(device, queue) then - if lib.is_iface(queue.external_interface.device) then + local device = lwutil.parse_instance(lwconfig) + local instance = lwconfig.softwire_config.instance[device] + if not lwutil.is_on_a_stick(lwconfig, device) then + if lib.is_iface(instance.external_device) then return setup.load_kernel_iface(graph, lwconfig, 'inetNic', 'b4sideNic') else return setup.load_phy(graph, lwconfig, 'inetNic', 'b4sideNic', diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index b8bb51850c..2387fbe0f3 100644 --- a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -21,6 +21,7 @@ local vlan = require("apps.vlan.vlan") local pci = require("lib.hardware.pci") local cltable = require("lib.cltable") local ipv4 = require("lib.protocol.ipv4") +local ipv6 = require("lib.protocol.ipv6") local ethernet = require("lib.protocol.ethernet") local ipv4_ntop = require("lib.yang.util").ipv4_ntop local binary = require("lib.yang.binary") @@ -86,7 +87,7 @@ function lwaftr_app(c, conf) { address = convert_ipv4(iexternal_interface.ip) }) config.app(c, "icmpechov6", ipv6_echo.ICMPEcho, { address = iinternal_interface.ip }) - config.app(c, "lwaftr", lwaftr.LwAftr, conf) + config.app(c, "lwaftr", lwaftr.LwAftr, lwutil.select_instance(conf)) config.app(c, "fragmenterv4", ipv4_fragment.Fragmenter, { mtu=gexternal_interface.mtu }) config.app(c, "fragmenterv6", ipv6_fragment.Fragmenter, @@ -95,14 +96,16 @@ function lwaftr_app(c, conf) { self_ip = iinternal_interface.ip, self_mac = iinternal_interface.mac, next_mac = iinternal_interface.next_hop.mac, - shared_next_mac_key = "group/"..device.."-ipv6-next-mac", + shared_next_mac_key = ("group/%s-ipv6-next-mac-%d"):format( + device, iinternal_interface.vlan_tag or 0), next_ip = iinternal_interface.next_hop.ip, alarm_notification = conf.alarm_notification }) config.app(c, "arp", arp.ARP, { self_ip = convert_ipv4(iexternal_interface.ip), self_mac = iexternal_interface.mac, next_mac = iexternal_interface.next_hop.mac, - shared_next_mac_key = "group/"..device.."-ipv4-next-mac", + shared_next_mac_key = ("group/%s-ipv4-next-mac-%d"):format( + device, iexternal_interface.vlan_tag or 0), next_ip = convert_ipv4(iexternal_interface.next_hop.ip), alarm_notification = conf.alarm_notification }) @@ -206,8 +209,8 @@ end function load_kernel_iface (c, conf, v4_nic_name, v6_nic_name) local RawSocket = require("apps.socket.raw").RawSocket - local v4_iface, id, queue = lwutil.parse_instance(conf) - local v6_iface = queue.external_interface.dev_info + local v6_iface, id, queue = lwutil.parse_instance(conf) + local v4_iface = queue.external_interface.dev_info local dev_info = {rx = "rx", tx = "tx"} lwaftr_app(c, conf, v6_iface) @@ -219,39 +222,255 @@ function load_kernel_iface (c, conf, v4_nic_name, v6_nic_name) link_sink(c, v4_nic_name..'.'..dev_info.rx, v6_nic_name..'.'..dev_info.rx) end +local intel_mp = require("apps.intel_mp.intel_mp") +local connectx = require("apps.mellanox.connectx") +local intel_avf = require("apps.intel_avf.intel_avf") + +local function cmd(...) + local cmd + for _, part in ipairs({...}) do + if not cmd then cmd = part + else cmd = cmd.." "..part end + end + print("shell:", cmd) + local status = os.execute(cmd) + assert(status == 0, ("Command failed with return code %d"):format(status)) +end + +function config_intel_mp(c, name, opt) + config.app(c, name, intel_mp.driver, { + pciaddr=opt.pci, + vmdq=true, -- Needed to enable MAC filtering/stamping. + rxq=opt.queue, + txq=opt.queue, + poolnum=0, + macaddr=ethernet:ntop(opt.mac), + vlan=opt.vlan, + rxcounter=opt.queue, + txcounter=opt.queue, + ring_buffer_size=opt.ring_buffer_size + }) + return name..'.input', name..'.output' +end + +function config_connectx(c, name, opt, lwconfig) + local function queue_id (opt, queue) + return ("%s.%s.%s"):format(ethernet:ntop(opt.mac), + opt.vlan or opt.vlan_tag, + queue or opt.queue) + end + local device = lwutil.parse_instance(lwconfig) + local queues = {} + for id, queue in pairs(lwconfig.softwire_config.instance[device].queue) do + queues[#queues+1] = { + id = queue_id(queue.external_interface, id), + mac = ethernet:ntop(queue.external_interface.mac), + vlan = queue.external_interface.vlan_tag + } + queues[#queues+1] = { + id = queue_id(queue.internal_interface, id), + mac = ethernet:ntop(queue.internal_interface.mac), + vlan = queue.internal_interface.vlan_tag + } + end + if lwutil.is_lowest_queue(lwconfig) then + config.app(c, "ConnectX_"..opt.pci:gsub("[%.:]", "_"), connectx.ConnectX, { + pciaddress = opt.pci, + queues = queues + }) + end + config.app(c, name, connectx.IO, { + pciaddress = opt.pci, + queue = queue_id(opt) + }) + local input, output = name..'.input', name..'.output' + if opt.vlan then + config.app(c, name.."_tag", vlan.Tagger, { tag=opt.vlan }) + config.link(c, name.."_tag.output -> "..input) + config.app(c, name.."_untag", vlan.Untagger, { tag=opt.vlan }) + config.link(c, output.." -> "..name.."_untag.input") + input, output = name.."_tag.input", name.."_untag.output" + end + return input, output +end + +function config_intel_avf(c, name, opt, lwconfig) + local nqueues = lwutil.num_queues(lwconfig) + if lwutil.is_lowest_queue(lwconfig) then + local _, _, queue = lwutil.parse_instance(lwconfig) + local v6_mcast = ipv6:solicited_node_mcast(queue.internal_interface.ip) + local mac_mcast = ethernet:ipv6_mcast(v6_mcast) + config.app(c, "IntelAVF_"..opt.pci:gsub("[%.:]", "_"), intel_avf.Intel_avf, { + pciaddr = opt.pci, + vlan = opt.vlan, + nqueues = nqueues, + macs = {mac_mcast} + }) + end + config.app(c, name, intel_avf.IO, { + pciaddr = opt.pci, + queue = opt.queue + }) + return name..'.input', name..'.output' +end + +function config_intel_avf_pf(c, name, opt, lwconfig) + local path = "/sys/bus/pci/devices/"..pci.qualified(opt.pci) + local ifname = lib.firstfile(path.."/net") + assert(ifname and lib.can_write(path.."/sriov_numvfs"), + "Unsupported device: "..opt.pci) + local vf = 0 -- which vf should this interface be on? + local numvf = 1 -- how many vfs do we need to create on the pf? + local vfmac = {} -- MACs to assign to vfs + local device, _, queue = lwutil.parse_instance(lwconfig) + if lwutil.is_on_a_stick(lwconfig, device) then + numvf = 2 + vfmac[0] = queue.external_interface.mac + vfmac[1] = queue.internal_interface.mac + if ethernet:ntop(opt.mac) == ethernet:ntop(queue.internal_interface.mac) then + vf = 1 + end + else + vfmac[0] = opt.mac + end + if lwutil.is_lowest_queue(lwconfig) then + print("Setting "..path.."/sriov_numvfs = "..numvf) + assert(lib.writefile(path.."/sriov_numvfs", numvf), + "Failed to allocate VFs.") + cmd('ip link set up', 'dev', ifname) + cmd('ip link set', ifname, 'vf', 0, 'mac', ethernet:ntop(vfmac[0])) + cmd('ip link set', ifname, 'vf', 0, 'spoofchk off') + pcall(cmd, 'ip link set', ifname, 'vf', 0, 'trust on') + if numvf == 2 then + cmd('ip link set', ifname, 'vf', 1, 'mac', ethernet:ntop(vfmac[1])) + cmd('ip link set', ifname, 'vf', 1, 'spoofchk off') + pcall(cmd, 'ip link set', ifname, 'vf', 1, 'trust on') + end + end + local vfpci = lib.basename(lib.readlink(path.."/virtfn"..vf)) + local avf_opt = { + pci = vfpci, + queue = opt.queue, + vlan = opt.vlan, + ring_buffer_size = opt.ring_buffer_size + } + return config_intel_avf(c, name, avf_opt, lwconfig) +end + +function config_nic(c, name, driver, opt, lwconfig) + local config_fn = { [intel_mp.driver] = config_intel_mp, + [connectx.driver] = config_connectx, + [intel_avf.driver] = config_intel_avf, + ['maybe_avf?'] = config_intel_avf_pf} + local f = assert(config_fn[(driver and require(driver).driver) or 'maybe_avf?'], + "Unsupported device: "..opt.pci) + return f(c, name, opt, lwconfig) +end + function load_phy(c, conf, v4_nic_name, v6_nic_name, ring_buffer_size) - local v4_pci, id, queue = lwutil.parse_instance(conf) - local v6_pci = queue.external_interface.device + local v6_pci, id, queue = lwutil.parse_instance(conf) + local v4_pci = conf.softwire_config.instance[v6_pci].external_device local v4_info = pci.device_info(v4_pci) local v6_info = pci.device_info(v6_pci) validate_pci_devices({v4_pci, v6_pci}) lwaftr_app(c, conf, v4_pci) - config.app(c, v4_nic_name, require(v4_info.driver).driver, { - pciaddr=v4_pci, - vmdq=true, -- Needed to enable MAC filtering/stamping. - rxq=id, - txq=id, - poolnum=0, - vlan=queue.external_interface.vlan_tag, - rxcounter=id, - txcounter=id, - ring_buffer_size=ring_buffer_size, - macaddr=ethernet:ntop(queue.external_interface.mac)}) - config.app(c, v6_nic_name, require(v6_info.driver).driver, { - pciaddr=v6_pci, - vmdq=true, -- Needed to enable MAC filtering/stamping. - rxq=id, - txq=id, - poolnum=0, - vlan=queue.internal_interface.vlan_tag, - rxcounter=id, - txcounter=id, - ring_buffer_size=ring_buffer_size, - macaddr = ethernet:ntop(queue.internal_interface.mac)}) + local v4_nic_opt = { + pci = v4_pci, + queue = id, + mac = queue.external_interface.mac, + vlan = queue.external_interface.vlan_tag, + ring_buffer_size = ring_buffer_size + } + local v4_input, v4_output = + config_nic(c, v4_nic_name, v4_info.driver, v4_nic_opt, conf) + + local v6_nic_opt = { + pci = v6_pci, + queue = id, + mac = queue.internal_interface.mac, + vlan = queue.internal_interface.vlan_tag, + ring_buffer_size = ring_buffer_size + } + local v6_input, v6_output = + config_nic(c, v6_nic_name, v6_info.driver, v6_nic_opt, conf) + + link_source(c, v4_output, v6_output) + link_sink(c, v4_input, v6_input) +end + +function load_xdp(c, conf, v4_nic_name, v6_nic_name, ring_buffer_size) + local v6_device, id, queue = lwutil.parse_instance(conf) + local v4_device = conf.softwire_config.instance[v6_device].external_device + assert(lib.is_iface(v4_device), v4_nic_name..": "..v4_device.." is not a Linux interface") + assert(lib.is_iface(v6_device), v6_nic_name..": "..v6_device.." is not a Linux interface") + assert(not lwutil.is_on_a_stick(conf, v6_device), + "--xdp does not support on-a-stick configuration") + + lwaftr_app(c, conf) + + config.app(c, v4_nic_name, require("apps.xdp.xdp").driver, { + ifname=v4_device, + queue=id}) + config.app(c, v6_nic_name, require("apps.xdp.xdp").driver, { + ifname=v6_device, + queue=id}) + + local v4_src, v6_src = v4_nic_name..'.output', v6_nic_name..'.output' + local v4_sink, v6_sink = v4_nic_name..'.input', v6_nic_name..'.input' + + -- Linux removes VLAN tag, but we have to tag outgoing packets + if queue.external_interface.vlan_tag then + config.app(c, "tagv4", vlan.Tagger, + { tag=queue.external_interface.vlan_tag }) + config.link(c, "tagv4.output -> "..v4_sink) + v4_sink = "tagv4.input" + end + if queue.internal_interface.vlan_tag then + config.app(c, "tagv6", vlan.Tagger, + { tag=queue.internal_interface.vlan_tag }) + config.link(c, "tagv6.output -> "..v6_sink) + v6_sink = "tagv6.input" + end + + link_source(c, v4_src, v6_src) + link_sink(c, v4_sink, v6_sink) +end - link_source(c, v4_nic_name..'.'..v4_info.tx, v6_nic_name..'.'..v6_info.tx) - link_sink(c, v4_nic_name..'.'..v4_info.rx, v6_nic_name..'.'..v6_info.rx) +function xdp_ifsetup(conf) + for idevice, instance in pairs(conf.softwire_config.instance) do + local edevice = instance.external_device + local icfg, ecfg + local nqueues = 0 + for _, queue in pairs(instance.queue) do + nqueues = nqueues + 1 + if not icfg then icfg = queue.internal_interface + else assert(lib.equal(icfg, queue.internal_interface)) end + if not ecfg then ecfg = queue.external_interface + else assert(lib.equal(ecfg, queue.external_interface)) end + end + for qid in pairs(instance.queue) do + assert(qid < nqueues) + end + local function ifsetup(ifname, cfg, opts, ip_ntop) + cmd('ip link set down', 'dev', ifname) + cmd('ip address flush', 'dev', ifname) + cmd('ip link set address', ethernet:ntop(cfg.mac), 'dev', ifname) + cmd('ip link set arp off', 'dev', ifname) + cmd('ip link set broadcast', "ff:ff:ff:ff:ff:ff", 'dev', ifname) + cmd('ip link set multicast on', 'dev', ifname) + cmd('ip link set mtu', opts.mtu, 'dev', ifname) + cmd('ip address add', ip_ntop(cfg.ip), 'dev', ifname) + cmd('ethtool --set-channels', ifname, 'combined', nqueues) + cmd('ip link set up', 'dev', ifname) + end + print("Configuring internal interface for XDP...") + ifsetup(idevice, icfg, conf.softwire_config.internal_interface, + function (ip) return ipv6:ntop(ip) end) + print("Configuring external interface for XDP...") + ifsetup(edevice, ecfg, conf.softwire_config.external_interface, + ipv4_ntop) + end end function load_on_a_stick_kernel_iface (c, conf, args) @@ -292,7 +511,7 @@ end function load_on_a_stick(c, conf, args) local pciaddr, id, queue = lwutil.parse_instance(conf) local device = pci.device_info(pciaddr) - local driver = require(device.driver).driver + local driver = device.driver validate_pci_devices({pciaddr}) lwaftr_app(c, conf, pciaddr) local v4_nic_name, v6_nic_name, v4v6, mirror = args.v4_nic_name, @@ -310,17 +529,17 @@ function load_on_a_stick(c, conf, args) assert(queue.external_interface.vlan_tag == queue.internal_interface.vlan_tag) assert(ethernet:ntop(queue.external_interface.mac) == ethernet:ntop(queue.internal_interface.mac)) - config.app(c, 'nic', driver, { - pciaddr = pciaddr, - vmdq=true, -- Needed to enable MAC filtering/stamping. - rxq=id, - txq=id, - poolnum=0, - vlan=queue.external_interface.vlan_tag, - ring_buffer_size=args.ring_buffer_size, - rxcounter = id, - txcounter = id, - macaddr = ethernet:ntop(queue.external_interface.mac)}) + + local v4v6_nic_opt = { + pci = pciaddr, + queue = id, + mac = queue.external_interface.mac, + vlan = queue.internal_interface.vlan_tag, + ring_buffer_size = args.ring_buffer_size + } + local v4v6_input, v4v6_output = + config_nic(c, 'nic', driver, v4v6_nic_opt, conf) + if mirror then local Tap = require("apps.tap.tap").Tap local ifname = mirror @@ -332,43 +551,39 @@ function load_on_a_stick(c, conf, args) else config.app(c, v4v6, V4V6) end - config.link(c, 'nic.'..device.tx..' -> '..v4v6..'.input') - config.link(c, v4v6..'.output -> nic.'..device.rx) + config.link(c, v4v6_output..' -> '..v4v6..'.input') + config.link(c, v4v6..'.output -> '..v4v6_input) link_source(c, v4v6..'.v4', v4v6..'.v6') link_sink(c, v4v6..'.v4', v4v6..'.v6') else - config.app(c, v4_nic_name, driver, { - pciaddr = pciaddr, - vmdq=true, -- Needed to enable MAC filtering/stamping. - rxq=id, - txq=id, - poolnum=0, - vlan=queue.external_interface.vlan_tag, - ring_buffer_size=args.ring_buffer_size, - rxcounter = id, - txcounter = id, - macaddr = ethernet:ntop(queue.external_interface.mac)}) - config.app(c, v6_nic_name, driver, { - pciaddr = pciaddr, - vmdq=true, -- Needed to enable MAC filtering/stamping. - rxq=id, - txq=id, - poolnum=1, - vlan=queue.internal_interface.vlan_tag, - ring_buffer_size=args.ring_buffer_size, - rxcounter = id, - txcounter = id, - macaddr = ethernet:ntop(queue.internal_interface.mac)}) - - link_source(c, v4_nic_name..'.'..device.tx, v6_nic_name..'.'..device.tx) - link_sink(c, v4_nic_name..'.'..device.rx, v6_nic_name..'.'..device.rx) + local v4_nic_opt = { + pci = pciaddr, + queue = id, + mac = queue.external_interface.mac, + vlan = queue.external_interface.vlan_tag, + ring_buffer_size = args.ring_buffer_size + } + local v4_input, v4_output = + config_nic(c, v4_nic_name, driver, v4_nic_opt, conf) + local v6_nic_opt = { + pci = pciaddr, + queue = id, + mac = queue.internal_interface.mac, + vlan = queue.internal_interface.vlan_tag, + ring_buffer_size = args.ring_buffer_size + } + local v6_input, v6_output = + config_nic(c, v6_nic_name, driver, v6_nic_opt, conf) + + link_source(c, v4_output, v6_output) + link_sink(c, v4_input, v6_input) end end function load_virt(c, conf, v4_nic_name, v6_nic_name) - local v4_pci, id, queue = lwutil.parse_instance(conf) - local v6_pci = queue.external_device.device + local v6_pci, id, queue = lwutil.parse_instance(conf) + local v4_pci = conf.softwire_config.instance[v6_pci].external_device lwaftr_app(c, conf, device) validate_pci_devices({v4_pci, v6_pci}) @@ -612,25 +827,14 @@ end -- will get its own worker process. local function compute_worker_configs(conf) local ret = {} - local copier = binary.config_copier_for_schema_by_name('snabb-softwire-v2') + local copier = binary.config_copier_for_schema_by_name('snabb-softwire-v3') local make_copy = copier(conf) for device, queues in pairs(conf.softwire_config.instance) do for id, _ in pairs(queues.queue) do local worker_id = string.format('%s/%s', device, id) local worker_config = make_copy() - local instance = worker_config.softwire_config.instance - for other_device, queues in pairs(conf.softwire_config.instance) do - if other_device ~= device then - instance[other_device] = nil - else - for other_id, _ in pairs(queues.queue) do - if other_id ~= id then - instance[device].queue[other_id] = nil - end - end - end - end - ret[worker_id] = worker_config + local meta = {worker_config = {device=device, queue_id=id}} + ret[worker_id] = setmetatable(worker_config, {__index=meta}) end end return ret @@ -668,7 +872,7 @@ function ptree_manager(f, conf, manager_opts) local initargs = { setup_fn = setup_fn, initial_configuration = conf, - schema_name = 'snabb-softwire-v2', + schema_name = 'snabb-softwire-v3', default_schema = 'ietf-softwire-br', -- log_level="DEBUG" } diff --git a/src/program/lwaftr/tests/config-migrations/selftest.sh b/src/program/lwaftr/tests/config-migrations/selftest.sh index d854728fca..4ac6f1f575 100755 --- a/src/program/lwaftr/tests/config-migrations/selftest.sh +++ b/src/program/lwaftr/tests/config-migrations/selftest.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/usr/bin/env bash # Attempt to migration from legacy to latest LEGACY_OUT=`./snabb lwaftr migrate-configuration -f legacy \ @@ -19,4 +19,4 @@ if [[ "$?" -ne "0" ]]; then echo "3.2.0 configuration migration failed (status code != 0)" echo "$V320_OUT" exit 1 -fi \ No newline at end of file +fi diff --git a/src/program/lwaftr/tests/data/counters/arp-for-next-hop.lua b/src/program/lwaftr/tests/data/counters/arp-for-next-hop.lua index c692e9d73e..10d952517d 100644 --- a/src/program/lwaftr/tests/data/counters/arp-for-next-hop.lua +++ b/src/program/lwaftr/tests/data/counters/arp-for-next-hop.lua @@ -2,4 +2,6 @@ return { ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, ["out-ipv4-frag-not"] = 1, + ["out-arp-request-bytes"] = 42, + ["out-arp-request-packets"] = 1, } diff --git a/src/program/lwaftr/tests/data/counters/from-inet-ipv4-in-binding-big-packet-df-set-allow.lua b/src/program/lwaftr/tests/data/counters/from-inet-ipv4-in-binding-big-packet-df-set-allow.lua index bd28c5fd2a..a710f73ef3 100644 --- a/src/program/lwaftr/tests/data/counters/from-inet-ipv4-in-binding-big-packet-df-set-allow.lua +++ b/src/program/lwaftr/tests/data/counters/from-inet-ipv4-in-binding-big-packet-df-set-allow.lua @@ -8,8 +8,8 @@ return { ["in-ipv4-packets"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, - ["out-icmpv4-bytes"] = 590, - ["out-icmpv4-packets"] = 1, + ["out-icmpv4-error-bytes"] = 590, + ["out-icmpv4-error-packets"] = 1, ["out-ipv4-bytes"] = 590, ["out-ipv4-frag-not"] = 1, ["out-ipv4-packets"] = 1, diff --git a/src/program/lwaftr/tests/data/counters/icmpv6-ping-and-reply.lua b/src/program/lwaftr/tests/data/counters/icmpv6-ping-and-reply.lua index 2e6a730d5a..c0e739a163 100644 --- a/src/program/lwaftr/tests/data/counters/icmpv6-ping-and-reply.lua +++ b/src/program/lwaftr/tests/data/counters/icmpv6-ping-and-reply.lua @@ -3,4 +3,8 @@ return { ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, ["out-ipv6-frag-not"] = 1, + ["in-icmpv6-echo-packets"] = 1, + ["in-icmpv6-echo-bytes"] = 74, + ["out-icmpv6-echo-packets"] = 1, + ["out-icmpv6-echo-bytes"] = 74, } diff --git a/src/program/lwaftr/tests/data/counters/in-1p-ipv4-out-1p-icmpv4.lua b/src/program/lwaftr/tests/data/counters/in-1p-ipv4-out-1p-icmpv4.lua index 39a5560d16..5d25e667a3 100644 --- a/src/program/lwaftr/tests/data/counters/in-1p-ipv4-out-1p-icmpv4.lua +++ b/src/program/lwaftr/tests/data/counters/in-1p-ipv4-out-1p-icmpv4.lua @@ -8,8 +8,8 @@ return { ["in-ipv4-packets"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, - ["out-icmpv4-bytes"] = 94, - ["out-icmpv4-packets"] = 1, + ["out-icmpv4-error-bytes"] = 94, + ["out-icmpv4-error-packets"] = 1, ["out-ipv4-bytes"] = 94, ["out-ipv4-frag-not"] = 1, ["out-ipv4-packets"] = 1, diff --git a/src/program/lwaftr/tests/data/counters/in-1p-ipv4-out-1p-ipv6-echo.lua b/src/program/lwaftr/tests/data/counters/in-1p-ipv4-out-1p-ipv6-echo.lua index 418eba156c..9e45c62802 100644 --- a/src/program/lwaftr/tests/data/counters/in-1p-ipv4-out-1p-ipv6-echo.lua +++ b/src/program/lwaftr/tests/data/counters/in-1p-ipv4-out-1p-ipv6-echo.lua @@ -8,4 +8,8 @@ return { ["out-ipv6-bytes"] = 106, ["out-ipv6-frag-not"] = 1, ["out-ipv6-packets"] = 1, + ["in-icmpv4-echo-packets"] = 1, + ["in-icmpv4-echo-bytes"] = 54, + ["out-icmpv4-echo-packets"] = 1, + ["out-icmpv4-echo-bytes"] = 54, } diff --git a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv4-1.lua b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv4-1.lua index c9191edd5b..b9eb814f37 100644 --- a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv4-1.lua +++ b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv4-1.lua @@ -4,8 +4,8 @@ return { ["in-ipv6-packets"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, - ["out-icmpv4-bytes"] = 94, - ["out-icmpv4-packets"] = 1, + ["out-icmpv4-error-bytes"] = 94, + ["out-icmpv4-error-packets"] = 1, ["out-ipv4-bytes"] = 94, ["out-ipv4-frag-not"] = 1, ["out-ipv4-packets"] = 1, diff --git a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv6-1.lua b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv6-1.lua index 16190036a3..9adebfee0f 100644 --- a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv6-1.lua +++ b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv6-1.lua @@ -8,8 +8,8 @@ return { ["in-ipv6-packets"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, - ["out-icmpv6-bytes"] = 154, - ["out-icmpv6-packets"] = 1, + ["out-icmpv6-error-bytes"] = 154, + ["out-icmpv6-error-packets"] = 1, ["out-ipv6-bytes"] = 154, ["out-ipv6-packets"] = 1, ["out-ipv6-frag-not"] = 1, diff --git a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv6-2.lua b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv6-2.lua index 2c41896cac..22195784b1 100644 --- a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv6-2.lua +++ b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv6-2.lua @@ -8,8 +8,8 @@ return { ["in-ipv6-packets"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, - ["out-icmpv6-bytes"] = 186, - ["out-icmpv6-packets"] = 1, + ["out-icmpv6-error-bytes"] = 186, + ["out-icmpv6-error-packets"] = 1, ["out-ipv6-bytes"] = 186, ["out-ipv6-packets"] = 1, ["out-ipv6-frag-not"] = 1, diff --git a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-ipv4-4-and-echo.lua b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-ipv4-4-and-echo.lua index b567ab93b3..f9e6b09f02 100644 --- a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-ipv4-4-and-echo.lua +++ b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-ipv4-4-and-echo.lua @@ -8,4 +8,8 @@ return { ["out-ipv4-frag-not"] = 1, ["out-ipv4-packets"] = 1, ["out-ipv6-frag-not"] = 1, + ["in-icmpv6-echo-packets"] = 1, + ["in-icmpv6-echo-bytes"] = 74, + ["out-icmpv6-echo-packets"] = 1, + ["out-icmpv6-echo-bytes"] = 74, } diff --git a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-ipv4-hoplimhair.lua b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-ipv4-hoplimhair.lua index de987128b6..7ebbc90158 100644 --- a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-ipv4-hoplimhair.lua +++ b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-ipv4-hoplimhair.lua @@ -4,8 +4,8 @@ return { ["in-ipv6-packets"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, - ["out-icmpv4-bytes"] = 94, - ["out-icmpv4-packets"] = 1, + ["out-icmpv4-error-bytes"] = 94, + ["out-icmpv4-error-packets"] = 1, ["out-ipv6-bytes"] = 134, ["out-ipv6-frag-not"] = 1, ["out-ipv6-packets"] = 1, diff --git a/src/program/lwaftr/tests/data/counters/in-ipv4-ipv6-out-icmpv4-ipv6-hairpin-1.lua b/src/program/lwaftr/tests/data/counters/in-ipv4-ipv6-out-icmpv4-ipv6-hairpin-1.lua index c1da09ef34..cd7abb7e60 100644 --- a/src/program/lwaftr/tests/data/counters/in-ipv4-ipv6-out-icmpv4-ipv6-hairpin-1.lua +++ b/src/program/lwaftr/tests/data/counters/in-ipv4-ipv6-out-icmpv4-ipv6-hairpin-1.lua @@ -10,8 +10,8 @@ return { ["in-ipv6-packets"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, - ["out-icmpv4-bytes"] = 94, - ["out-icmpv4-packets"] = 1, + ["out-icmpv4-error-bytes"] = 94, + ["out-icmpv4-error-packets"] = 1, ["out-ipv6-bytes"] = 134, ["out-ipv6-frag-not"] = 1, ["out-ipv6-packets"] = 1, diff --git a/src/program/lwaftr/tests/data/counters/ndp-no-na-next-hop6-mac-not-set-2pkts.lua b/src/program/lwaftr/tests/data/counters/ndp-no-na-next-hop6-mac-not-set-2pkts.lua index 0bfb63a5fd..fdffa2d088 100644 --- a/src/program/lwaftr/tests/data/counters/ndp-no-na-next-hop6-mac-not-set-2pkts.lua +++ b/src/program/lwaftr/tests/data/counters/ndp-no-na-next-hop6-mac-not-set-2pkts.lua @@ -12,4 +12,6 @@ return { ["out-ipv6-bytes"] = 106, ["out-ipv6-frag-not"] = 1, ["out-ipv6-packets"] = 1, + ["out-ndp-ns-packets"] = 1, + ["out-ndp-ns-bytes"] = 86, } diff --git a/src/program/lwaftr/tests/data/counters/ndp-no-na-next-hop6-mac-not-set-3pkts.lua b/src/program/lwaftr/tests/data/counters/ndp-no-na-next-hop6-mac-not-set-3pkts.lua index b85c3e5ded..5ae37831cf 100644 --- a/src/program/lwaftr/tests/data/counters/ndp-no-na-next-hop6-mac-not-set-3pkts.lua +++ b/src/program/lwaftr/tests/data/counters/ndp-no-na-next-hop6-mac-not-set-3pkts.lua @@ -12,4 +12,8 @@ return { ["out-ipv6-bytes"] = 106, ["out-ipv6-frag-not"] = 2, ["out-ipv6-packets"] = 1, + ["out-ndp-ns-packets"] = 1, + ["out-ndp-ns-bytes"] = 86, + ["in-ndp-na-packets"] = 1, + ["in-ndp-na-bytes"] = 86, } diff --git a/src/program/lwaftr/tests/data/counters/ndp-ns-for-next-hop.lua b/src/program/lwaftr/tests/data/counters/ndp-ns-for-next-hop.lua index 2a8197e9c0..24f4b3e4f8 100644 --- a/src/program/lwaftr/tests/data/counters/ndp-ns-for-next-hop.lua +++ b/src/program/lwaftr/tests/data/counters/ndp-ns-for-next-hop.lua @@ -2,4 +2,6 @@ return { ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, ["out-ipv6-frag-not"] = 1, + ["out-ndp-ns-packets"] = 1, + ["out-ndp-ns-bytes"] = 86, } diff --git a/src/program/lwaftr/tests/data/counters/ndp-secondary.lua b/src/program/lwaftr/tests/data/counters/ndp-secondary.lua index 3ce9e1bc91..21e9924102 100644 --- a/src/program/lwaftr/tests/data/counters/ndp-secondary.lua +++ b/src/program/lwaftr/tests/data/counters/ndp-secondary.lua @@ -2,4 +2,6 @@ return { ["in-ipv6-frag-reassembly-unneeded"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, + ["in-ndp-ns-packets"] = 1, + ["in-ndp-ns-bytes"] = 86, } diff --git a/src/program/lwaftr/tests/data/counters/nofrag4-echo.lua b/src/program/lwaftr/tests/data/counters/nofrag4-echo.lua new file mode 100644 index 0000000000..c65654e0f1 --- /dev/null +++ b/src/program/lwaftr/tests/data/counters/nofrag4-echo.lua @@ -0,0 +1,10 @@ +return { + ["in-ipv4-frag-reassembly-unneeded"] = 1, + ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, + ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, + ["out-ipv4-frag-not"] = 1, + ["out-icmpv4-echo-packets"] = 1, + ["out-icmpv4-echo-bytes"] = 54, + ["in-icmpv4-echo-packets"] = 1, + ["in-icmpv4-echo-bytes"] = 54, +} diff --git a/src/program/lwaftr/tests/data/counters/nofrag4.lua b/src/program/lwaftr/tests/data/counters/nofrag4.lua index 6e95815eba..bc266a4c02 100644 --- a/src/program/lwaftr/tests/data/counters/nofrag4.lua +++ b/src/program/lwaftr/tests/data/counters/nofrag4.lua @@ -3,4 +3,8 @@ return { ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, ["out-ipv4-frag-not"] = 1, + ["out-arp-reply-packets"] = 1, + ["out-arp-reply-bytes"] = 42, + ["in-arp-request-bytes"] = 42, + ["in-arp-request-packets"] = 1, } diff --git a/src/program/lwaftr/tests/data/counters/nofrag6-no-icmp.lua b/src/program/lwaftr/tests/data/counters/nofrag6-no-icmp.lua new file mode 100644 index 0000000000..3ce9e1bc91 --- /dev/null +++ b/src/program/lwaftr/tests/data/counters/nofrag6-no-icmp.lua @@ -0,0 +1,5 @@ +return { + ["in-ipv6-frag-reassembly-unneeded"] = 1, + ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, + ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, +} diff --git a/src/program/lwaftr/tests/data/counters/nofrag6-sol.lua b/src/program/lwaftr/tests/data/counters/nofrag6-sol.lua index 2e6a730d5a..70f8ca17ab 100644 --- a/src/program/lwaftr/tests/data/counters/nofrag6-sol.lua +++ b/src/program/lwaftr/tests/data/counters/nofrag6-sol.lua @@ -3,4 +3,8 @@ return { ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, ["out-ipv6-frag-not"] = 1, + ["out-ndp-na-packets"] = 1, + ["out-ndp-na-bytes"] = 86, + ["in-ndp-ns-packets"] = 1, + ["in-ndp-ns-bytes"] = 86, } diff --git a/src/program/lwaftr/tests/data/counters/nofrag6.lua b/src/program/lwaftr/tests/data/counters/nofrag6.lua index 3ce9e1bc91..21e9924102 100644 --- a/src/program/lwaftr/tests/data/counters/nofrag6.lua +++ b/src/program/lwaftr/tests/data/counters/nofrag6.lua @@ -2,4 +2,6 @@ return { ["in-ipv6-frag-reassembly-unneeded"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, + ["in-ndp-ns-packets"] = 1, + ["in-ndp-ns-bytes"] = 86, } diff --git a/src/program/lwaftr/tests/data/counters/tcp-frominet-bound-ttl1.lua b/src/program/lwaftr/tests/data/counters/tcp-frominet-bound-ttl1.lua index 55ea25db4b..6e67dbfde1 100644 --- a/src/program/lwaftr/tests/data/counters/tcp-frominet-bound-ttl1.lua +++ b/src/program/lwaftr/tests/data/counters/tcp-frominet-bound-ttl1.lua @@ -8,8 +8,8 @@ return { ["in-ipv4-packets"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, - ["out-icmpv4-bytes"] = 94, - ["out-icmpv4-packets"] = 1, + ["out-icmpv4-error-bytes"] = 94, + ["out-icmpv4-error-packets"] = 1, ["out-ipv4-bytes"] = 94, ["out-ipv4-frag-not"] = 1, ["out-ipv4-packets"] = 1, diff --git a/src/program/lwaftr/tests/data/ndp_outgoing_ns.pcap b/src/program/lwaftr/tests/data/ndp_outgoing_ns.pcap index 1169e042ce..871ce745b7 100644 Binary files a/src/program/lwaftr/tests/data/ndp_outgoing_ns.pcap and b/src/program/lwaftr/tests/data/ndp_outgoing_ns.pcap differ diff --git a/src/program/lwaftr/tests/data/vlan/ndp_ns_and_recap.pcap b/src/program/lwaftr/tests/data/vlan/ndp_ns_and_recap.pcap index e8fbd3971f..6d75e56a8d 100644 Binary files a/src/program/lwaftr/tests/data/vlan/ndp_ns_and_recap.pcap and b/src/program/lwaftr/tests/data/vlan/ndp_ns_and_recap.pcap differ diff --git a/src/program/lwaftr/tests/data/vlan/ndp_outgoing_ns.pcap b/src/program/lwaftr/tests/data/vlan/ndp_outgoing_ns.pcap index a829f9b457..6320e70cbb 100644 Binary files a/src/program/lwaftr/tests/data/vlan/ndp_outgoing_ns.pcap and b/src/program/lwaftr/tests/data/vlan/ndp_outgoing_ns.pcap differ diff --git a/src/program/lwaftr/tests/end-to-end/test_env.sh b/src/program/lwaftr/tests/end-to-end/test_env.sh index 632f536a11..5204ac2f71 100755 --- a/src/program/lwaftr/tests/end-to-end/test_env.sh +++ b/src/program/lwaftr/tests/end-to-end/test_env.sh @@ -365,7 +365,7 @@ TEST_DATA=( "ingress-filter: from-b4 (IPv6) packet found in binding table (DROP)" "no_icmp_with_filters_drop.conf" "" "tcp-fromb4-ipv6.pcap" "" "" -"nofrag6.lua" +"nofrag6-no-icmp.lua" # Egress filters @@ -375,7 +375,7 @@ TEST_DATA=( "egress-filter: to-internet (IPv4) (DROP)" "no_icmp_with_filters_drop.conf" "" "tcp-fromb4-ipv6.pcap" "" "" -"nofrag6.lua" +"nofrag6-no-icmp.lua" "egress-filter: to-b4 (IPv4) (ACCEPT)" "no_icmp_with_filters_accept.conf" "tcp-frominet-trafficclass.pcap" "" "" "tcp-afteraftr-ipv6-trafficclass.pcap" @@ -389,11 +389,11 @@ TEST_DATA=( "ICMP Echo to AFTR (IPv4)" "no_icmp.conf" "ping-v4.pcap" "" "ping-v4-reply.pcap" "" -"nofrag4.lua" +"nofrag4-echo.lua" "ICMP Echo to AFTR (IPv4) (ttl=32)" "no_icmp.conf" "ping-v4-ttl-32.pcap" "" "ping-v4-reply.pcap" "" -"nofrag4.lua" +"nofrag4-echo.lua" "ICMP Echo to AFTR (IPv4) + data" "no_icmp.conf" "ping-v4-and-data.pcap" "" "ping-v4-reply.pcap" "tcp-afteraftr-ipv6.pcap" diff --git a/src/program/lwaftr/tests/propbased/genyang.lua b/src/program/lwaftr/tests/propbased/genyang.lua index ec22103cdb..7459516456 100644 --- a/src/program/lwaftr/tests/propbased/genyang.lua +++ b/src/program/lwaftr/tests/propbased/genyang.lua @@ -12,7 +12,7 @@ local util = require("lib.yang.util") local capabilities = {['ietf-softwire-br']={feature={'binding'}},} require('lib.yang.schema').set_default_capabilities(capabilities) -local schemas = { "ietf-softwire-br", "snabb-softwire-v2" } +local schemas = { "ietf-softwire-br", "snabb-softwire-v3" } -- choose an element of an array randomly local function choose(choices) @@ -501,7 +501,7 @@ end function selftest() print('selftest: program.lwaftr.tests.propbased.genyang') - local schema = schema.load_schema_by_name("snabb-softwire-v2") + local schema = schema.load_schema_by_name("snabb-softwire-v3") local grammar = data.config_grammar_from_schema(schema) for i=1,1000 do generate_xpath_and_val(schema, true) end diff --git a/src/program/lwaftr/tests/subcommands/config_test.py b/src/program/lwaftr/tests/subcommands/config_test.py index 8c7d999cfb..93f6fcd148 100644 --- a/src/program/lwaftr/tests/subcommands/config_test.py +++ b/src/program/lwaftr/tests/subcommands/config_test.py @@ -68,7 +68,7 @@ class TestConfigGet(BaseTestCase): """ daemon_args = DAEMON_ARGS - config_args = (str(SNABB_CMD), 'config', 'get', '--schema=snabb-softwire-v2', DAEMON_PROC_NAME) + config_args = (str(SNABB_CMD), 'config', 'get', '--schema=snabb-softwire-v3', DAEMON_PROC_NAME) @classmethod def setUpClass(cls): @@ -130,7 +130,7 @@ class TestConfigMultiproc(BaseTestCase): daemon = None daemon_args = DAEMON_ARGS ps_args = (str(SNABB_CMD), 'ps') - config_args = (str(SNABB_CMD), 'config', 'XXX', '--schema=snabb-softwire-v2', DAEMON_PROC_NAME) + config_args = (str(SNABB_CMD), 'config', 'XXX', '--schema=snabb-softwire-v3', DAEMON_PROC_NAME) @classmethod def setUpClass(cls): @@ -396,7 +396,7 @@ def setUpClass(cls): cls.reportAndFail('Config manager socket not present', None) def get_cmd_args(self, action): - cmd_args = list((str(SNABB_CMD), 'config', 'XXX', '--schema=snabb-softwire-v2', DAEMON_PROC_NAME)) + cmd_args = list((str(SNABB_CMD), 'config', 'XXX', '--schema=snabb-softwire-v3', DAEMON_PROC_NAME)) cmd_args[2] = action return cmd_args diff --git a/src/program/snabbnfv/traffic/traffic.lua b/src/program/snabbnfv/traffic/traffic.lua index 12ea0f7bf3..883c6f360c 100644 --- a/src/program/snabbnfv/traffic/traffic.lua +++ b/src/program/snabbnfv/traffic/traffic.lua @@ -9,7 +9,6 @@ local ffi = require("ffi") local C = ffi.C local timer = require("core.timer") local pci = require("lib.hardware.pci") -local ingress_drop_monitor = require("lib.timers.ingress_drop_monitor") local counter = require("core.counter") local long_opts = { @@ -91,7 +90,6 @@ function traffic (pciaddr, confpath, sockpath) timer.activate(timer.new("reconf", check_for_reconfigure, 1e9, 'repeating')) -- Flush logs every second. timer.activate(timer.new("flush", io.flush, 1e9, 'repeating')) - timer.activate(ingress_drop_monitor.new({action='warn'}):timer()) while true do needs_reconfigure = false print("Loading " .. confpath) diff --git a/src/program/snabbvmx/query/example1.xml b/src/program/snabbvmx/query/example1.xml index 3e19ca584a..796896f612 100644 --- a/src/program/snabbvmx/query/example1.xml +++ b/src/program/snabbvmx/query/example1.xml @@ -92,10 +92,10 @@ 114681497770 0 0 - 0 - 0 - 0 - 0 + 0 + 0 + 0 + 0 5119140314 0 0 diff --git a/src/program/snabbvmx/query/example2.xml b/src/program/snabbvmx/query/example2.xml index 9ac3997759..8b42e0dcfb 100644 --- a/src/program/snabbvmx/query/example2.xml +++ b/src/program/snabbvmx/query/example2.xml @@ -83,10 +83,10 @@ 0 0 0 - 0 - 0 - 0 - 0 + 0 + 0 + 0 + 0 0 0 0 @@ -320,10 +320,10 @@ 0 0 0 - 0 - 0 - 0 - 0 + 0 + 0 + 0 + 0 0 0 0 diff --git a/src/program/top/top.lua b/src/program/top/top.lua index 38c6d2ea20..6db5d3617f 100644 --- a/src/program/top/top.lua +++ b/src/program/top/top.lua @@ -606,9 +606,24 @@ function compute_display_tree.interface(tree, prev, dt, t) -- \- pci device, macaddr, mtu, speed -- RX: PPS, bps, %, [drops/s] -- TX: PPS, bps, %, [drops/s] + function queue_local_key(key, counters) + local queue_key + local stem = ({rxdrop='rxdrops'})[key] or key + for i=0,15 do + local k = 'q'..i..'_'..stem + if counters[k] then + if queue_key then + return key + end + queue_key = k + end + end + return queue_key or key + end local function rate(key, counters, prev) if not counters then return 0/0 end if not counters[key] then return 0/0 end + key = queue_local_key(key, counters) local v, rrd = counters[key], nil prev = prev and prev[key] if is_leaf(v) then @@ -628,7 +643,7 @@ function compute_display_tree.interface(tree, prev, dt, t) rchars('%s:', tag:upper()), lchars('%.3f %sPPS', scale(pps)), lchars('%.3f %sbps', scale(bps)), - lchars('%.2f%%', bps/max*100), + max > 0 and lchars('%.2f%%', bps/max*100) or nil, drops > 0 and rchars('%.3f %sPPS dropped', scale(drops)) or nil) end local function show_pci(addr, pci, prev) @@ -636,7 +651,8 @@ function compute_display_tree.interface(tree, prev, dt, t) gridrow(rchars('| '), lchars('')) gridrow(rchars('\\-'), rchars('%s:', addr), - lchars('%d %sbE, MAC: %s', bps, tag, + lchars('%sMAC: %s', + (bps > 0 and ("%d %sbE, "):format(bps, tag)) or '', macaddr_string(tonumber(pci.macaddr and pci.macaddr.value) or 0))) show_traffic('rx', pci, prev) show_traffic('tx', pci, prev)