cgroups: Fix systemd cgroup support

As github.com/containerd/cgroups doesn't support scope
units which are essential in some cases lets create
the cgroups manually and load it trough the cgroups
api
This is currently done only when there's single sandbox
cgroup (sandbox_cgroup_only=true), otherwise we set it
as static cgroup path as it used to be (until a proper
soultion for overhead cgroup under systemd will be
suggested)

Fixes: #2868
Signed-off-by: Snir Sheriber <ssheribe@redhat.com>
This commit is contained in:
Snir Sheriber 2021-11-02 16:57:25 +02:00
parent b34ed403c5
commit bcf181b7ee
135 changed files with 22391 additions and 5 deletions

View File

@ -13,6 +13,7 @@ require (
github.com/containerd/ttrpc v1.0.2
github.com/containerd/typeurl v1.0.2
github.com/containernetworking/plugins v0.9.1
github.com/coreos/go-systemd/v22 v22.3.2
github.com/cri-o/cri-o v1.0.0-rc2.0.20170928185954-3394b3b2d6af
github.com/fsnotify/fsnotify v1.4.9
github.com/go-ini/ini v1.28.2
@ -21,6 +22,7 @@ require (
github.com/go-openapi/strfmt v0.18.0
github.com/go-openapi/swag v0.19.5
github.com/go-openapi/validate v0.18.0
github.com/godbus/dbus/v5 v5.0.4
github.com/gogo/protobuf v1.3.2
github.com/hashicorp/go-multierror v1.0.0
github.com/intel-go/cpuid v0.0.0-20210602155658-5747e5cec0d9

View File

@ -97,6 +97,7 @@ github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5P
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=
github.com/cilium/ebpf v0.2.0/go.mod h1:To2CFviqOWL/M0gIMsvSMlqe7em/l1ALkX1PyjrX2Qs=
github.com/cilium/ebpf v0.4.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs=
github.com/cilium/ebpf v0.6.2 h1:iHsfF/t4aW4heW2YKfeHrVPGdtYTL4C4KocpM8KTSnI=
github.com/cilium/ebpf v0.6.2/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
@ -154,6 +155,7 @@ github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsr
github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7DoTY=
github.com/cri-o/cri-o v1.0.0-rc2.0.20170928185954-3394b3b2d6af h1:H6nLV96F1LkWizYLQtrMtqJBrlJxnpjgisHsTsOS2HU=
github.com/cri-o/cri-o v1.0.0-rc2.0.20170928185954-3394b3b2d6af/go.mod h1:POmDVglzQ2jWTlL9ZCfZ8d1QjLhmk0oB36O8T0oG75Y=
github.com/cyphar/filepath-securejoin v0.2.2 h1:jCwT2GTP+PY5nBz3c/YL5PAIbusElVrPujOBSCj8xRg=
github.com/cyphar/filepath-securejoin v0.2.2/go.mod h1:FpkQEhXnPnOthhzymB7CGsFk2G9VLXONKD9G7QGMM+4=
github.com/d2g/dhcp4 v0.0.0-20170904100407-a1d1b6c41b1c/go.mod h1:Ct2BUK8SB0YC1SMSibvLzxjeJLnrYEVLULFNiHY9YfQ=
github.com/d2g/dhcp4client v1.0.0/go.mod h1:j0hNfjhrt2SxUOw55nL0ATM/z4Yt3t2Kd1mW34z5W5s=
@ -183,6 +185,7 @@ github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7
github.com/evanphx/json-patch v4.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk=
github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4=
github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k=
github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY=
github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k=
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4=
@ -480,6 +483,7 @@ github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQD
github.com/safchain/ethtool v0.0.0-20190326074333-42ed695e3de8 h1:2c1EFnZHIPCW8qKWgHMH/fX2PkSabFc5mrVzfUNdg5U=
github.com/safchain/ethtool v0.0.0-20190326074333-42ed695e3de8/go.mod h1:Z0q5wiBQGYcxhMZ6gUqHn6pYNLypFAvaL3UvgZLR0U4=
github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0=
github.com/seccomp/libseccomp-golang v0.9.1 h1:NJjM5DNFOs0s3kYE1WUOr6G8V97sdt46rlXTMfXGWBo=
github.com/seccomp/libseccomp-golang v0.9.1/go.mod h1:GbW5+tmTXfcxTToHLXlScSlAvWlF4P2Ca7zGrPiEpWo=
github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=

View File

@ -0,0 +1,17 @@
---
Language: Cpp
BasedOnStyle: LLVM
AlignAfterOpenBracket: DontAlign
AlignConsecutiveAssignments: true
AlignEscapedNewlines: DontAlign
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortFunctionsOnASingleLine: false
BreakBeforeBraces: Attach
IndentWidth: 4
KeepEmptyLinesAtTheStartOfBlocks: false
TabWidth: 4
UseTab: ForContinuationAndIndentation
ColumnLimit: 1000
...

13
src/runtime/vendor/github.com/cilium/ebpf/.gitignore generated vendored Normal file
View File

@ -0,0 +1,13 @@
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib
*.o
# Test binary, build with `go test -c`
*.test
# Output of the go coverage tool, specifically when used with LiteIDE
*.out

View File

@ -0,0 +1,29 @@
---
issues:
exclude-rules:
# syscall param structs will have unused fields in Go code.
- path: syscall.*.go
linters:
- structcheck
linters:
disable-all: true
enable:
- deadcode
- errcheck
- goimports
- gosimple
- govet
- ineffassign
- misspell
- staticcheck
- structcheck
- typecheck
- unused
- varcheck
# Could be enabled later:
# - gocyclo
# - prealloc
# - maligned
# - gosec

View File

@ -0,0 +1,80 @@
Architecture of the library
===
ELF -> Specifications -> Objects -> Links
ELF
---
BPF is usually produced by using Clang to compile a subset of C. Clang outputs
an ELF file which contains program byte code (aka BPF), but also metadata for
maps used by the program. The metadata follows the conventions set by libbpf
shipped with the kernel. Certain ELF sections have special meaning
and contain structures defined by libbpf. Newer versions of clang emit
additional metadata in BPF Type Format (aka BTF).
The library aims to be compatible with libbpf so that moving from a C toolchain
to a Go one creates little friction. To that end, the [ELF reader](elf_reader.go)
is tested against the Linux selftests and avoids introducing custom behaviour
if possible.
The output of the ELF reader is a `CollectionSpec` which encodes
all of the information contained in the ELF in a form that is easy to work with
in Go.
### BTF
The BPF Type Format describes more than just the types used by a BPF program. It
includes debug aids like which source line corresponds to which instructions and
what global variables are used.
[BTF parsing](internal/btf/) lives in a separate internal package since exposing
it would mean an additional maintenance burden, and because the API still
has sharp corners. The most important concept is the `btf.Type` interface, which
also describes things that aren't really types like `.rodata` or `.bss` sections.
`btf.Type`s can form cyclical graphs, which can easily lead to infinite loops if
one is not careful. Hopefully a safe pattern to work with `btf.Type` emerges as
we write more code that deals with it.
Specifications
---
`CollectionSpec`, `ProgramSpec` and `MapSpec` are blueprints for in-kernel
objects and contain everything necessary to execute the relevant `bpf(2)`
syscalls. Since the ELF reader outputs a `CollectionSpec` it's possible to
modify clang-compiled BPF code, for example to rewrite constants. At the same
time the [asm](asm/) package provides an assembler that can be used to generate
`ProgramSpec` on the fly.
Creating a spec should never require any privileges or be restricted in any way,
for example by only allowing programs in native endianness. This ensures that
the library stays flexible.
Objects
---
`Program` and `Map` are the result of loading specs into the kernel. Sometimes
loading a spec will fail because the kernel is too old, or a feature is not
enabled. There are multiple ways the library deals with that:
* Fallback: older kernels don't allowing naming programs and maps. The library
automatically detects support for names, and omits them during load if
necessary. This works since name is primarily a debug aid.
* Sentinel error: sometimes it's possible to detect that a feature isn't available.
In that case the library will return an error wrapping `ErrNotSupported`.
This is also useful to skip tests that can't run on the current kernel.
Once program and map objects are loaded they expose the kernel's low-level API,
e.g. `NextKey`. Often this API is awkward to use in Go, so there are safer
wrappers on top of the low-level API, like `MapIterator`. The low-level API is
useful as an out when our higher-level API doesn't support a particular use case.
Links
---
BPF can be attached to many different points in the kernel and newer BPF hooks
tend to use bpf_link to do so. Older hooks unfortunately use a combination of
syscalls, netlink messages, etc. Adding support for a new link type should not
pull in large dependencies like netlink, so XDP programs or tracepoints are
out of scope.

View File

@ -0,0 +1,46 @@
# Contributor Covenant Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
## Scope
This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at nathanjsweet at gmail dot com or i at lmb dot io. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version]
[homepage]: http://contributor-covenant.org
[version]: http://contributor-covenant.org/version/1/4/

View File

@ -0,0 +1,40 @@
# How to contribute
Development is on [GitHub](https://github.com/cilium/ebpf) and contributions in
the form of pull requests and issues reporting bugs or suggesting new features
are welcome. Please take a look at [the architecture](ARCHITECTURE.md) to get
a better understanding for the high-level goals.
New features must be accompanied by tests. Before starting work on any large
feature, please [join](https://cilium.herokuapp.com/) the
[#libbpf-go](https://cilium.slack.com/messages/libbpf-go) channel on Slack to
discuss the design first.
When submitting pull requests, consider writing details about what problem you
are solving and why the proposed approach solves that problem in commit messages
and/or pull request description to help future library users and maintainers to
reason about the proposed changes.
## Running the tests
Many of the tests require privileges to set resource limits and load eBPF code.
The easiest way to obtain these is to run the tests with `sudo`.
To test the current package with your local kernel you can simply run:
```
go test -exec sudo ./...
```
To test the current package with a different kernel version you can use the [run-tests.sh](run-tests.sh) script.
It requires [virtme](https://github.com/amluto/virtme) and qemu to be installed.
Examples:
```bash
# Run all tests on a 5.4 kernel
./run-tests.sh 5.4
# Run a subset of tests:
./run-tests.sh 5.4 go test ./link
```

23
src/runtime/vendor/github.com/cilium/ebpf/LICENSE generated vendored Normal file
View File

@ -0,0 +1,23 @@
MIT License
Copyright (c) 2017 Nathan Sweet
Copyright (c) 2018, 2019 Cloudflare
Copyright (c) 2019 Authors of Cilium
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

70
src/runtime/vendor/github.com/cilium/ebpf/Makefile generated vendored Normal file
View File

@ -0,0 +1,70 @@
# The development version of clang is distributed as the 'clang' binary,
# while stable/released versions have a version number attached.
# Pin the default clang to a stable version.
CLANG ?= clang-12
CFLAGS := -target bpf -O2 -g -Wall -Werror $(CFLAGS)
# Obtain an absolute path to the directory of the Makefile.
# Assume the Makefile is in the root of the repository.
REPODIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
UIDGID := $(shell stat -c '%u:%g' ${REPODIR})
IMAGE := $(shell cat ${REPODIR}/testdata/docker/IMAGE)
VERSION := $(shell cat ${REPODIR}/testdata/docker/VERSION)
# clang <8 doesn't tag relocs properly (STT_NOTYPE)
# clang 9 is the first version emitting BTF
TARGETS := \
testdata/loader-clang-7 \
testdata/loader-clang-9 \
testdata/loader-$(CLANG) \
testdata/invalid_map \
testdata/raw_tracepoint \
testdata/invalid_map_static \
testdata/initialized_btf_map \
testdata/strings \
internal/btf/testdata/relocs
.PHONY: all clean docker-all docker-shell
.DEFAULT_TARGET = docker-all
# Build all ELF binaries using a Dockerized LLVM toolchain.
docker-all:
docker run --rm --user "${UIDGID}" \
-v "${REPODIR}":/ebpf -w /ebpf --env MAKEFLAGS \
--env CFLAGS="-fdebug-prefix-map=/ebpf=." \
"${IMAGE}:${VERSION}" \
make all
# (debug) Drop the user into a shell inside the Docker container as root.
docker-shell:
docker run --rm -ti \
-v "${REPODIR}":/ebpf -w /ebpf \
"${IMAGE}:${VERSION}"
clean:
-$(RM) testdata/*.elf
-$(RM) internal/btf/testdata/*.elf
all: $(addsuffix -el.elf,$(TARGETS)) $(addsuffix -eb.elf,$(TARGETS))
ln -srf testdata/loader-$(CLANG)-el.elf testdata/loader-el.elf
ln -srf testdata/loader-$(CLANG)-eb.elf testdata/loader-eb.elf
testdata/loader-%-el.elf: testdata/loader.c
$* $(CFLAGS) -mlittle-endian -c $< -o $@
testdata/loader-%-eb.elf: testdata/loader.c
$* $(CFLAGS) -mbig-endian -c $< -o $@
%-el.elf: %.c
$(CLANG) $(CFLAGS) -mlittle-endian -c $< -o $@
%-eb.elf : %.c
$(CLANG) $(CFLAGS) -mbig-endian -c $< -o $@
# Usage: make VMLINUX=/path/to/vmlinux vmlinux-btf
.PHONY: vmlinux-btf
vmlinux-btf: internal/btf/testdata/vmlinux-btf.gz
internal/btf/testdata/vmlinux-btf.gz: $(VMLINUX)
objcopy --dump-section .BTF=/dev/stdout "$<" /dev/null | gzip > "$@"

62
src/runtime/vendor/github.com/cilium/ebpf/README.md generated vendored Normal file
View File

@ -0,0 +1,62 @@
# eBPF
[![PkgGoDev](https://pkg.go.dev/badge/github.com/cilium/ebpf)](https://pkg.go.dev/github.com/cilium/ebpf)
eBPF is a pure Go library that provides utilities for loading, compiling, and
debugging eBPF programs. It has minimal external dependencies and is intended to
be used in long running processes.
* [asm](https://pkg.go.dev/github.com/cilium/ebpf/asm) contains a basic
assembler
* [link](https://pkg.go.dev/github.com/cilium/ebpf/link) allows attaching eBPF
to various hooks
* [perf](https://pkg.go.dev/github.com/cilium/ebpf/perf) allows reading from a
`PERF_EVENT_ARRAY`
* [cmd/bpf2go](https://pkg.go.dev/github.com/cilium/ebpf/cmd/bpf2go) allows
compiling and embedding eBPF programs in Go code
The library is maintained by [Cloudflare](https://www.cloudflare.com) and
[Cilium](https://www.cilium.io). Feel free to
[join](https://cilium.herokuapp.com/) the
[#libbpf-go](https://cilium.slack.com/messages/libbpf-go) channel on Slack.
## Current status
The package is production ready, but **the API is explicitly unstable right
now**. Expect to update your code if you want to follow along.
## Getting Started
A small collection of Go and eBPF programs that serve as examples for building
your own tools can be found under [examples/](examples/).
Contributions are highly encouraged, as they highlight certain use cases of
eBPF and the library, and help shape the future of the project.
## Requirements
* A version of Go that is [supported by
upstream](https://golang.org/doc/devel/release.html#policy)
* Linux 4.9, 4.19 or 5.4 (versions in-between should work, but are not tested)
## Useful resources
* [eBPF.io](https://ebpf.io) (recommended)
* [Cilium eBPF documentation](https://docs.cilium.io/en/latest/bpf/#bpf-guide)
(recommended)
* [Linux documentation on
BPF](https://www.kernel.org/doc/html/latest/networking/filter.html)
* [eBPF features by Linux
version](https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md)
## Regenerating Testdata
Run `make` in the root of this repository to rebuild testdata in all
subpackages. This requires Docker, as it relies on a standardized build
environment to keep the build output stable.
The toolchain image build files are kept in [testdata/docker/](testdata/docker/).
## License
MIT

149
src/runtime/vendor/github.com/cilium/ebpf/asm/alu.go generated vendored Normal file
View File

@ -0,0 +1,149 @@
package asm
//go:generate stringer -output alu_string.go -type=Source,Endianness,ALUOp
// Source of ALU / ALU64 / Branch operations
//
// msb lsb
// +----+-+---+
// |op |S|cls|
// +----+-+---+
type Source uint8
const sourceMask OpCode = 0x08
// Source bitmask
const (
// InvalidSource is returned by getters when invoked
// on non ALU / branch OpCodes.
InvalidSource Source = 0xff
// ImmSource src is from constant
ImmSource Source = 0x00
// RegSource src is from register
RegSource Source = 0x08
)
// The Endianness of a byte swap instruction.
type Endianness uint8
const endianMask = sourceMask
// Endian flags
const (
InvalidEndian Endianness = 0xff
// Convert to little endian
LE Endianness = 0x00
// Convert to big endian
BE Endianness = 0x08
)
// ALUOp are ALU / ALU64 operations
//
// msb lsb
// +----+-+---+
// |OP |s|cls|
// +----+-+---+
type ALUOp uint8
const aluMask OpCode = 0xf0
const (
// InvalidALUOp is returned by getters when invoked
// on non ALU OpCodes
InvalidALUOp ALUOp = 0xff
// Add - addition
Add ALUOp = 0x00
// Sub - subtraction
Sub ALUOp = 0x10
// Mul - multiplication
Mul ALUOp = 0x20
// Div - division
Div ALUOp = 0x30
// Or - bitwise or
Or ALUOp = 0x40
// And - bitwise and
And ALUOp = 0x50
// LSh - bitwise shift left
LSh ALUOp = 0x60
// RSh - bitwise shift right
RSh ALUOp = 0x70
// Neg - sign/unsign signing bit
Neg ALUOp = 0x80
// Mod - modulo
Mod ALUOp = 0x90
// Xor - bitwise xor
Xor ALUOp = 0xa0
// Mov - move value from one place to another
Mov ALUOp = 0xb0
// ArSh - arithmatic shift
ArSh ALUOp = 0xc0
// Swap - endian conversions
Swap ALUOp = 0xd0
)
// HostTo converts from host to another endianness.
func HostTo(endian Endianness, dst Register, size Size) Instruction {
var imm int64
switch size {
case Half:
imm = 16
case Word:
imm = 32
case DWord:
imm = 64
default:
return Instruction{OpCode: InvalidOpCode}
}
return Instruction{
OpCode: OpCode(ALUClass).SetALUOp(Swap).SetSource(Source(endian)),
Dst: dst,
Constant: imm,
}
}
// Op returns the OpCode for an ALU operation with a given source.
func (op ALUOp) Op(source Source) OpCode {
return OpCode(ALU64Class).SetALUOp(op).SetSource(source)
}
// Reg emits `dst (op) src`.
func (op ALUOp) Reg(dst, src Register) Instruction {
return Instruction{
OpCode: op.Op(RegSource),
Dst: dst,
Src: src,
}
}
// Imm emits `dst (op) value`.
func (op ALUOp) Imm(dst Register, value int32) Instruction {
return Instruction{
OpCode: op.Op(ImmSource),
Dst: dst,
Constant: int64(value),
}
}
// Op32 returns the OpCode for a 32-bit ALU operation with a given source.
func (op ALUOp) Op32(source Source) OpCode {
return OpCode(ALUClass).SetALUOp(op).SetSource(source)
}
// Reg32 emits `dst (op) src`, zeroing the upper 32 bit of dst.
func (op ALUOp) Reg32(dst, src Register) Instruction {
return Instruction{
OpCode: op.Op32(RegSource),
Dst: dst,
Src: src,
}
}
// Imm32 emits `dst (op) value`, zeroing the upper 32 bit of dst.
func (op ALUOp) Imm32(dst Register, value int32) Instruction {
return Instruction{
OpCode: op.Op32(ImmSource),
Dst: dst,
Constant: int64(value),
}
}

View File

@ -0,0 +1,107 @@
// Code generated by "stringer -output alu_string.go -type=Source,Endianness,ALUOp"; DO NOT EDIT.
package asm
import "strconv"
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[InvalidSource-255]
_ = x[ImmSource-0]
_ = x[RegSource-8]
}
const (
_Source_name_0 = "ImmSource"
_Source_name_1 = "RegSource"
_Source_name_2 = "InvalidSource"
)
func (i Source) String() string {
switch {
case i == 0:
return _Source_name_0
case i == 8:
return _Source_name_1
case i == 255:
return _Source_name_2
default:
return "Source(" + strconv.FormatInt(int64(i), 10) + ")"
}
}
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[InvalidEndian-255]
_ = x[LE-0]
_ = x[BE-8]
}
const (
_Endianness_name_0 = "LE"
_Endianness_name_1 = "BE"
_Endianness_name_2 = "InvalidEndian"
)
func (i Endianness) String() string {
switch {
case i == 0:
return _Endianness_name_0
case i == 8:
return _Endianness_name_1
case i == 255:
return _Endianness_name_2
default:
return "Endianness(" + strconv.FormatInt(int64(i), 10) + ")"
}
}
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[InvalidALUOp-255]
_ = x[Add-0]
_ = x[Sub-16]
_ = x[Mul-32]
_ = x[Div-48]
_ = x[Or-64]
_ = x[And-80]
_ = x[LSh-96]
_ = x[RSh-112]
_ = x[Neg-128]
_ = x[Mod-144]
_ = x[Xor-160]
_ = x[Mov-176]
_ = x[ArSh-192]
_ = x[Swap-208]
}
const _ALUOp_name = "AddSubMulDivOrAndLShRShNegModXorMovArShSwapInvalidALUOp"
var _ALUOp_map = map[ALUOp]string{
0: _ALUOp_name[0:3],
16: _ALUOp_name[3:6],
32: _ALUOp_name[6:9],
48: _ALUOp_name[9:12],
64: _ALUOp_name[12:14],
80: _ALUOp_name[14:17],
96: _ALUOp_name[17:20],
112: _ALUOp_name[20:23],
128: _ALUOp_name[23:26],
144: _ALUOp_name[26:29],
160: _ALUOp_name[29:32],
176: _ALUOp_name[32:35],
192: _ALUOp_name[35:39],
208: _ALUOp_name[39:43],
255: _ALUOp_name[43:55],
}
func (i ALUOp) String() string {
if str, ok := _ALUOp_map[i]; ok {
return str
}
return "ALUOp(" + strconv.FormatInt(int64(i), 10) + ")"
}

2
src/runtime/vendor/github.com/cilium/ebpf/asm/doc.go generated vendored Normal file
View File

@ -0,0 +1,2 @@
// Package asm is an assembler for eBPF bytecode.
package asm

195
src/runtime/vendor/github.com/cilium/ebpf/asm/func.go generated vendored Normal file
View File

@ -0,0 +1,195 @@
package asm
//go:generate stringer -output func_string.go -type=BuiltinFunc
// BuiltinFunc is a built-in eBPF function.
type BuiltinFunc int32
// eBPF built-in functions
//
// You can regenerate this list using the following gawk script:
//
// /FN\(.+\),/ {
// match($1, /\((.+)\)/, r)
// split(r[1], p, "_")
// printf "Fn"
// for (i in p) {
// printf "%s%s", toupper(substr(p[i], 1, 1)), substr(p[i], 2)
// }
// print ""
// }
//
// The script expects include/uapi/linux/bpf.h as it's input.
const (
FnUnspec BuiltinFunc = iota
FnMapLookupElem
FnMapUpdateElem
FnMapDeleteElem
FnProbeRead
FnKtimeGetNs
FnTracePrintk
FnGetPrandomU32
FnGetSmpProcessorId
FnSkbStoreBytes
FnL3CsumReplace
FnL4CsumReplace
FnTailCall
FnCloneRedirect
FnGetCurrentPidTgid
FnGetCurrentUidGid
FnGetCurrentComm
FnGetCgroupClassid
FnSkbVlanPush
FnSkbVlanPop
FnSkbGetTunnelKey
FnSkbSetTunnelKey
FnPerfEventRead
FnRedirect
FnGetRouteRealm
FnPerfEventOutput
FnSkbLoadBytes
FnGetStackid
FnCsumDiff
FnSkbGetTunnelOpt
FnSkbSetTunnelOpt
FnSkbChangeProto
FnSkbChangeType
FnSkbUnderCgroup
FnGetHashRecalc
FnGetCurrentTask
FnProbeWriteUser
FnCurrentTaskUnderCgroup
FnSkbChangeTail
FnSkbPullData
FnCsumUpdate
FnSetHashInvalid
FnGetNumaNodeId
FnSkbChangeHead
FnXdpAdjustHead
FnProbeReadStr
FnGetSocketCookie
FnGetSocketUid
FnSetHash
FnSetsockopt
FnSkbAdjustRoom
FnRedirectMap
FnSkRedirectMap
FnSockMapUpdate
FnXdpAdjustMeta
FnPerfEventReadValue
FnPerfProgReadValue
FnGetsockopt
FnOverrideReturn
FnSockOpsCbFlagsSet
FnMsgRedirectMap
FnMsgApplyBytes
FnMsgCorkBytes
FnMsgPullData
FnBind
FnXdpAdjustTail
FnSkbGetXfrmState
FnGetStack
FnSkbLoadBytesRelative
FnFibLookup
FnSockHashUpdate
FnMsgRedirectHash
FnSkRedirectHash
FnLwtPushEncap
FnLwtSeg6StoreBytes
FnLwtSeg6AdjustSrh
FnLwtSeg6Action
FnRcRepeat
FnRcKeydown
FnSkbCgroupId
FnGetCurrentCgroupId
FnGetLocalStorage
FnSkSelectReuseport
FnSkbAncestorCgroupId
FnSkLookupTcp
FnSkLookupUdp
FnSkRelease
FnMapPushElem
FnMapPopElem
FnMapPeekElem
FnMsgPushData
FnMsgPopData
FnRcPointerRel
FnSpinLock
FnSpinUnlock
FnSkFullsock
FnTcpSock
FnSkbEcnSetCe
FnGetListenerSock
FnSkcLookupTcp
FnTcpCheckSyncookie
FnSysctlGetName
FnSysctlGetCurrentValue
FnSysctlGetNewValue
FnSysctlSetNewValue
FnStrtol
FnStrtoul
FnSkStorageGet
FnSkStorageDelete
FnSendSignal
FnTcpGenSyncookie
FnSkbOutput
FnProbeReadUser
FnProbeReadKernel
FnProbeReadUserStr
FnProbeReadKernelStr
FnTcpSendAck
FnSendSignalThread
FnJiffies64
FnReadBranchRecords
FnGetNsCurrentPidTgid
FnXdpOutput
FnGetNetnsCookie
FnGetCurrentAncestorCgroupId
FnSkAssign
FnKtimeGetBootNs
FnSeqPrintf
FnSeqWrite
FnSkCgroupId
FnSkAncestorCgroupId
FnRingbufOutput
FnRingbufReserve
FnRingbufSubmit
FnRingbufDiscard
FnRingbufQuery
FnCsumLevel
FnSkcToTcp6Sock
FnSkcToTcpSock
FnSkcToTcpTimewaitSock
FnSkcToTcpRequestSock
FnSkcToUdp6Sock
FnGetTaskStack
FnLoadHdrOpt
FnStoreHdrOpt
FnReserveHdrOpt
FnInodeStorageGet
FnInodeStorageDelete
FnDPath
FnCopyFromUser
FnSnprintfBtf
FnSeqPrintfBtf
FnSkbCgroupClassid
FnRedirectNeigh
FnPerCpuPtr
FnThisCpuPtr
FnRedirectPeer
FnTaskStorageGet
FnTaskStorageDelete
FnGetCurrentTaskBtf
FnBprmOptsSet
FnKtimeGetCoarseNs
FnImaInodeHash
FnSockFromFile
)
// Call emits a function call.
func (fn BuiltinFunc) Call() Instruction {
return Instruction{
OpCode: OpCode(JumpClass).SetJumpOp(Call),
Constant: int64(fn),
}
}

View File

@ -0,0 +1,185 @@
// Code generated by "stringer -output func_string.go -type=BuiltinFunc"; DO NOT EDIT.
package asm
import "strconv"
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[FnUnspec-0]
_ = x[FnMapLookupElem-1]
_ = x[FnMapUpdateElem-2]
_ = x[FnMapDeleteElem-3]
_ = x[FnProbeRead-4]
_ = x[FnKtimeGetNs-5]
_ = x[FnTracePrintk-6]
_ = x[FnGetPrandomU32-7]
_ = x[FnGetSmpProcessorId-8]
_ = x[FnSkbStoreBytes-9]
_ = x[FnL3CsumReplace-10]
_ = x[FnL4CsumReplace-11]
_ = x[FnTailCall-12]
_ = x[FnCloneRedirect-13]
_ = x[FnGetCurrentPidTgid-14]
_ = x[FnGetCurrentUidGid-15]
_ = x[FnGetCurrentComm-16]
_ = x[FnGetCgroupClassid-17]
_ = x[FnSkbVlanPush-18]
_ = x[FnSkbVlanPop-19]
_ = x[FnSkbGetTunnelKey-20]
_ = x[FnSkbSetTunnelKey-21]
_ = x[FnPerfEventRead-22]
_ = x[FnRedirect-23]
_ = x[FnGetRouteRealm-24]
_ = x[FnPerfEventOutput-25]
_ = x[FnSkbLoadBytes-26]
_ = x[FnGetStackid-27]
_ = x[FnCsumDiff-28]
_ = x[FnSkbGetTunnelOpt-29]
_ = x[FnSkbSetTunnelOpt-30]
_ = x[FnSkbChangeProto-31]
_ = x[FnSkbChangeType-32]
_ = x[FnSkbUnderCgroup-33]
_ = x[FnGetHashRecalc-34]
_ = x[FnGetCurrentTask-35]
_ = x[FnProbeWriteUser-36]
_ = x[FnCurrentTaskUnderCgroup-37]
_ = x[FnSkbChangeTail-38]
_ = x[FnSkbPullData-39]
_ = x[FnCsumUpdate-40]
_ = x[FnSetHashInvalid-41]
_ = x[FnGetNumaNodeId-42]
_ = x[FnSkbChangeHead-43]
_ = x[FnXdpAdjustHead-44]
_ = x[FnProbeReadStr-45]
_ = x[FnGetSocketCookie-46]
_ = x[FnGetSocketUid-47]
_ = x[FnSetHash-48]
_ = x[FnSetsockopt-49]
_ = x[FnSkbAdjustRoom-50]
_ = x[FnRedirectMap-51]
_ = x[FnSkRedirectMap-52]
_ = x[FnSockMapUpdate-53]
_ = x[FnXdpAdjustMeta-54]
_ = x[FnPerfEventReadValue-55]
_ = x[FnPerfProgReadValue-56]
_ = x[FnGetsockopt-57]
_ = x[FnOverrideReturn-58]
_ = x[FnSockOpsCbFlagsSet-59]
_ = x[FnMsgRedirectMap-60]
_ = x[FnMsgApplyBytes-61]
_ = x[FnMsgCorkBytes-62]
_ = x[FnMsgPullData-63]
_ = x[FnBind-64]
_ = x[FnXdpAdjustTail-65]
_ = x[FnSkbGetXfrmState-66]
_ = x[FnGetStack-67]
_ = x[FnSkbLoadBytesRelative-68]
_ = x[FnFibLookup-69]
_ = x[FnSockHashUpdate-70]
_ = x[FnMsgRedirectHash-71]
_ = x[FnSkRedirectHash-72]
_ = x[FnLwtPushEncap-73]
_ = x[FnLwtSeg6StoreBytes-74]
_ = x[FnLwtSeg6AdjustSrh-75]
_ = x[FnLwtSeg6Action-76]
_ = x[FnRcRepeat-77]
_ = x[FnRcKeydown-78]
_ = x[FnSkbCgroupId-79]
_ = x[FnGetCurrentCgroupId-80]
_ = x[FnGetLocalStorage-81]
_ = x[FnSkSelectReuseport-82]
_ = x[FnSkbAncestorCgroupId-83]
_ = x[FnSkLookupTcp-84]
_ = x[FnSkLookupUdp-85]
_ = x[FnSkRelease-86]
_ = x[FnMapPushElem-87]
_ = x[FnMapPopElem-88]
_ = x[FnMapPeekElem-89]
_ = x[FnMsgPushData-90]
_ = x[FnMsgPopData-91]
_ = x[FnRcPointerRel-92]
_ = x[FnSpinLock-93]
_ = x[FnSpinUnlock-94]
_ = x[FnSkFullsock-95]
_ = x[FnTcpSock-96]
_ = x[FnSkbEcnSetCe-97]
_ = x[FnGetListenerSock-98]
_ = x[FnSkcLookupTcp-99]
_ = x[FnTcpCheckSyncookie-100]
_ = x[FnSysctlGetName-101]
_ = x[FnSysctlGetCurrentValue-102]
_ = x[FnSysctlGetNewValue-103]
_ = x[FnSysctlSetNewValue-104]
_ = x[FnStrtol-105]
_ = x[FnStrtoul-106]
_ = x[FnSkStorageGet-107]
_ = x[FnSkStorageDelete-108]
_ = x[FnSendSignal-109]
_ = x[FnTcpGenSyncookie-110]
_ = x[FnSkbOutput-111]
_ = x[FnProbeReadUser-112]
_ = x[FnProbeReadKernel-113]
_ = x[FnProbeReadUserStr-114]
_ = x[FnProbeReadKernelStr-115]
_ = x[FnTcpSendAck-116]
_ = x[FnSendSignalThread-117]
_ = x[FnJiffies64-118]
_ = x[FnReadBranchRecords-119]
_ = x[FnGetNsCurrentPidTgid-120]
_ = x[FnXdpOutput-121]
_ = x[FnGetNetnsCookie-122]
_ = x[FnGetCurrentAncestorCgroupId-123]
_ = x[FnSkAssign-124]
_ = x[FnKtimeGetBootNs-125]
_ = x[FnSeqPrintf-126]
_ = x[FnSeqWrite-127]
_ = x[FnSkCgroupId-128]
_ = x[FnSkAncestorCgroupId-129]
_ = x[FnRingbufOutput-130]
_ = x[FnRingbufReserve-131]
_ = x[FnRingbufSubmit-132]
_ = x[FnRingbufDiscard-133]
_ = x[FnRingbufQuery-134]
_ = x[FnCsumLevel-135]
_ = x[FnSkcToTcp6Sock-136]
_ = x[FnSkcToTcpSock-137]
_ = x[FnSkcToTcpTimewaitSock-138]
_ = x[FnSkcToTcpRequestSock-139]
_ = x[FnSkcToUdp6Sock-140]
_ = x[FnGetTaskStack-141]
_ = x[FnLoadHdrOpt-142]
_ = x[FnStoreHdrOpt-143]
_ = x[FnReserveHdrOpt-144]
_ = x[FnInodeStorageGet-145]
_ = x[FnInodeStorageDelete-146]
_ = x[FnDPath-147]
_ = x[FnCopyFromUser-148]
_ = x[FnSnprintfBtf-149]
_ = x[FnSeqPrintfBtf-150]
_ = x[FnSkbCgroupClassid-151]
_ = x[FnRedirectNeigh-152]
_ = x[FnPerCpuPtr-153]
_ = x[FnThisCpuPtr-154]
_ = x[FnRedirectPeer-155]
_ = x[FnTaskStorageGet-156]
_ = x[FnTaskStorageDelete-157]
_ = x[FnGetCurrentTaskBtf-158]
_ = x[FnBprmOptsSet-159]
_ = x[FnKtimeGetCoarseNs-160]
_ = x[FnImaInodeHash-161]
_ = x[FnSockFromFile-162]
}
const _BuiltinFunc_name = "FnUnspecFnMapLookupElemFnMapUpdateElemFnMapDeleteElemFnProbeReadFnKtimeGetNsFnTracePrintkFnGetPrandomU32FnGetSmpProcessorIdFnSkbStoreBytesFnL3CsumReplaceFnL4CsumReplaceFnTailCallFnCloneRedirectFnGetCurrentPidTgidFnGetCurrentUidGidFnGetCurrentCommFnGetCgroupClassidFnSkbVlanPushFnSkbVlanPopFnSkbGetTunnelKeyFnSkbSetTunnelKeyFnPerfEventReadFnRedirectFnGetRouteRealmFnPerfEventOutputFnSkbLoadBytesFnGetStackidFnCsumDiffFnSkbGetTunnelOptFnSkbSetTunnelOptFnSkbChangeProtoFnSkbChangeTypeFnSkbUnderCgroupFnGetHashRecalcFnGetCurrentTaskFnProbeWriteUserFnCurrentTaskUnderCgroupFnSkbChangeTailFnSkbPullDataFnCsumUpdateFnSetHashInvalidFnGetNumaNodeIdFnSkbChangeHeadFnXdpAdjustHeadFnProbeReadStrFnGetSocketCookieFnGetSocketUidFnSetHashFnSetsockoptFnSkbAdjustRoomFnRedirectMapFnSkRedirectMapFnSockMapUpdateFnXdpAdjustMetaFnPerfEventReadValueFnPerfProgReadValueFnGetsockoptFnOverrideReturnFnSockOpsCbFlagsSetFnMsgRedirectMapFnMsgApplyBytesFnMsgCorkBytesFnMsgPullDataFnBindFnXdpAdjustTailFnSkbGetXfrmStateFnGetStackFnSkbLoadBytesRelativeFnFibLookupFnSockHashUpdateFnMsgRedirectHashFnSkRedirectHashFnLwtPushEncapFnLwtSeg6StoreBytesFnLwtSeg6AdjustSrhFnLwtSeg6ActionFnRcRepeatFnRcKeydownFnSkbCgroupIdFnGetCurrentCgroupIdFnGetLocalStorageFnSkSelectReuseportFnSkbAncestorCgroupIdFnSkLookupTcpFnSkLookupUdpFnSkReleaseFnMapPushElemFnMapPopElemFnMapPeekElemFnMsgPushDataFnMsgPopDataFnRcPointerRelFnSpinLockFnSpinUnlockFnSkFullsockFnTcpSockFnSkbEcnSetCeFnGetListenerSockFnSkcLookupTcpFnTcpCheckSyncookieFnSysctlGetNameFnSysctlGetCurrentValueFnSysctlGetNewValueFnSysctlSetNewValueFnStrtolFnStrtoulFnSkStorageGetFnSkStorageDeleteFnSendSignalFnTcpGenSyncookieFnSkbOutputFnProbeReadUserFnProbeReadKernelFnProbeReadUserStrFnProbeReadKernelStrFnTcpSendAckFnSendSignalThreadFnJiffies64FnReadBranchRecordsFnGetNsCurrentPidTgidFnXdpOutputFnGetNetnsCookieFnGetCurrentAncestorCgroupIdFnSkAssignFnKtimeGetBootNsFnSeqPrintfFnSeqWriteFnSkCgroupIdFnSkAncestorCgroupIdFnRingbufOutputFnRingbufReserveFnRingbufSubmitFnRingbufDiscardFnRingbufQueryFnCsumLevelFnSkcToTcp6SockFnSkcToTcpSockFnSkcToTcpTimewaitSockFnSkcToTcpRequestSockFnSkcToUdp6SockFnGetTaskStackFnLoadHdrOptFnStoreHdrOptFnReserveHdrOptFnInodeStorageGetFnInodeStorageDeleteFnDPathFnCopyFromUserFnSnprintfBtfFnSeqPrintfBtfFnSkbCgroupClassidFnRedirectNeighFnPerCpuPtrFnThisCpuPtrFnRedirectPeerFnTaskStorageGetFnTaskStorageDeleteFnGetCurrentTaskBtfFnBprmOptsSetFnKtimeGetCoarseNsFnImaInodeHashFnSockFromFile"
var _BuiltinFunc_index = [...]uint16{0, 8, 23, 38, 53, 64, 76, 89, 104, 123, 138, 153, 168, 178, 193, 212, 230, 246, 264, 277, 289, 306, 323, 338, 348, 363, 380, 394, 406, 416, 433, 450, 466, 481, 497, 512, 528, 544, 568, 583, 596, 608, 624, 639, 654, 669, 683, 700, 714, 723, 735, 750, 763, 778, 793, 808, 828, 847, 859, 875, 894, 910, 925, 939, 952, 958, 973, 990, 1000, 1022, 1033, 1049, 1066, 1082, 1096, 1115, 1133, 1148, 1158, 1169, 1182, 1202, 1219, 1238, 1259, 1272, 1285, 1296, 1309, 1321, 1334, 1347, 1359, 1373, 1383, 1395, 1407, 1416, 1429, 1446, 1460, 1479, 1494, 1517, 1536, 1555, 1563, 1572, 1586, 1603, 1615, 1632, 1643, 1658, 1675, 1693, 1713, 1725, 1743, 1754, 1773, 1794, 1805, 1821, 1849, 1859, 1875, 1886, 1896, 1908, 1928, 1943, 1959, 1974, 1990, 2004, 2015, 2030, 2044, 2066, 2087, 2102, 2116, 2128, 2141, 2156, 2173, 2193, 2200, 2214, 2227, 2241, 2259, 2274, 2285, 2297, 2311, 2327, 2346, 2365, 2378, 2396, 2410, 2424}
func (i BuiltinFunc) String() string {
if i < 0 || i >= BuiltinFunc(len(_BuiltinFunc_index)-1) {
return "BuiltinFunc(" + strconv.FormatInt(int64(i), 10) + ")"
}
return _BuiltinFunc_name[_BuiltinFunc_index[i]:_BuiltinFunc_index[i+1]]
}

View File

@ -0,0 +1,506 @@
package asm
import (
"crypto/sha1"
"encoding/binary"
"encoding/hex"
"errors"
"fmt"
"io"
"math"
"strings"
"github.com/cilium/ebpf/internal/unix"
)
// InstructionSize is the size of a BPF instruction in bytes
const InstructionSize = 8
// RawInstructionOffset is an offset in units of raw BPF instructions.
type RawInstructionOffset uint64
// Bytes returns the offset of an instruction in bytes.
func (rio RawInstructionOffset) Bytes() uint64 {
return uint64(rio) * InstructionSize
}
// Instruction is a single eBPF instruction.
type Instruction struct {
OpCode OpCode
Dst Register
Src Register
Offset int16
Constant int64
Reference string
Symbol string
}
// Sym creates a symbol.
func (ins Instruction) Sym(name string) Instruction {
ins.Symbol = name
return ins
}
// Unmarshal decodes a BPF instruction.
func (ins *Instruction) Unmarshal(r io.Reader, bo binary.ByteOrder) (uint64, error) {
var bi bpfInstruction
err := binary.Read(r, bo, &bi)
if err != nil {
return 0, err
}
ins.OpCode = bi.OpCode
ins.Offset = bi.Offset
ins.Constant = int64(bi.Constant)
ins.Dst, ins.Src, err = bi.Registers.Unmarshal(bo)
if err != nil {
return 0, fmt.Errorf("can't unmarshal registers: %s", err)
}
if !bi.OpCode.IsDWordLoad() {
return InstructionSize, nil
}
var bi2 bpfInstruction
if err := binary.Read(r, bo, &bi2); err != nil {
// No Wrap, to avoid io.EOF clash
return 0, errors.New("64bit immediate is missing second half")
}
if bi2.OpCode != 0 || bi2.Offset != 0 || bi2.Registers != 0 {
return 0, errors.New("64bit immediate has non-zero fields")
}
ins.Constant = int64(uint64(uint32(bi2.Constant))<<32 | uint64(uint32(bi.Constant)))
return 2 * InstructionSize, nil
}
// Marshal encodes a BPF instruction.
func (ins Instruction) Marshal(w io.Writer, bo binary.ByteOrder) (uint64, error) {
if ins.OpCode == InvalidOpCode {
return 0, errors.New("invalid opcode")
}
isDWordLoad := ins.OpCode.IsDWordLoad()
cons := int32(ins.Constant)
if isDWordLoad {
// Encode least significant 32bit first for 64bit operations.
cons = int32(uint32(ins.Constant))
}
regs, err := newBPFRegisters(ins.Dst, ins.Src, bo)
if err != nil {
return 0, fmt.Errorf("can't marshal registers: %s", err)
}
bpfi := bpfInstruction{
ins.OpCode,
regs,
ins.Offset,
cons,
}
if err := binary.Write(w, bo, &bpfi); err != nil {
return 0, err
}
if !isDWordLoad {
return InstructionSize, nil
}
bpfi = bpfInstruction{
Constant: int32(ins.Constant >> 32),
}
if err := binary.Write(w, bo, &bpfi); err != nil {
return 0, err
}
return 2 * InstructionSize, nil
}
// RewriteMapPtr changes an instruction to use a new map fd.
//
// Returns an error if the instruction doesn't load a map.
func (ins *Instruction) RewriteMapPtr(fd int) error {
if !ins.OpCode.IsDWordLoad() {
return fmt.Errorf("%s is not a 64 bit load", ins.OpCode)
}
if ins.Src != PseudoMapFD && ins.Src != PseudoMapValue {
return errors.New("not a load from a map")
}
// Preserve the offset value for direct map loads.
offset := uint64(ins.Constant) & (math.MaxUint32 << 32)
rawFd := uint64(uint32(fd))
ins.Constant = int64(offset | rawFd)
return nil
}
// MapPtr returns the map fd for this instruction.
//
// The result is undefined if the instruction is not a load from a map,
// see IsLoadFromMap.
func (ins *Instruction) MapPtr() int {
return int(int32(uint64(ins.Constant) & math.MaxUint32))
}
// RewriteMapOffset changes the offset of a direct load from a map.
//
// Returns an error if the instruction is not a direct load.
func (ins *Instruction) RewriteMapOffset(offset uint32) error {
if !ins.OpCode.IsDWordLoad() {
return fmt.Errorf("%s is not a 64 bit load", ins.OpCode)
}
if ins.Src != PseudoMapValue {
return errors.New("not a direct load from a map")
}
fd := uint64(ins.Constant) & math.MaxUint32
ins.Constant = int64(uint64(offset)<<32 | fd)
return nil
}
func (ins *Instruction) mapOffset() uint32 {
return uint32(uint64(ins.Constant) >> 32)
}
// IsLoadFromMap returns true if the instruction loads from a map.
//
// This covers both loading the map pointer and direct map value loads.
func (ins *Instruction) IsLoadFromMap() bool {
return ins.OpCode == LoadImmOp(DWord) && (ins.Src == PseudoMapFD || ins.Src == PseudoMapValue)
}
// IsFunctionCall returns true if the instruction calls another BPF function.
//
// This is not the same thing as a BPF helper call.
func (ins *Instruction) IsFunctionCall() bool {
return ins.OpCode.JumpOp() == Call && ins.Src == PseudoCall
}
// IsConstantLoad returns true if the instruction loads a constant of the
// given size.
func (ins *Instruction) IsConstantLoad(size Size) bool {
return ins.OpCode == LoadImmOp(size) && ins.Src == R0 && ins.Offset == 0
}
// Format implements fmt.Formatter.
func (ins Instruction) Format(f fmt.State, c rune) {
if c != 'v' {
fmt.Fprintf(f, "{UNRECOGNIZED: %c}", c)
return
}
op := ins.OpCode
if op == InvalidOpCode {
fmt.Fprint(f, "INVALID")
return
}
// Omit trailing space for Exit
if op.JumpOp() == Exit {
fmt.Fprint(f, op)
return
}
if ins.IsLoadFromMap() {
fd := ins.MapPtr()
switch ins.Src {
case PseudoMapFD:
fmt.Fprintf(f, "LoadMapPtr dst: %s fd: %d", ins.Dst, fd)
case PseudoMapValue:
fmt.Fprintf(f, "LoadMapValue dst: %s, fd: %d off: %d", ins.Dst, fd, ins.mapOffset())
}
goto ref
}
fmt.Fprintf(f, "%v ", op)
switch cls := op.Class(); cls {
case LdClass, LdXClass, StClass, StXClass:
switch op.Mode() {
case ImmMode:
fmt.Fprintf(f, "dst: %s imm: %d", ins.Dst, ins.Constant)
case AbsMode:
fmt.Fprintf(f, "imm: %d", ins.Constant)
case IndMode:
fmt.Fprintf(f, "dst: %s src: %s imm: %d", ins.Dst, ins.Src, ins.Constant)
case MemMode:
fmt.Fprintf(f, "dst: %s src: %s off: %d imm: %d", ins.Dst, ins.Src, ins.Offset, ins.Constant)
case XAddMode:
fmt.Fprintf(f, "dst: %s src: %s", ins.Dst, ins.Src)
}
case ALU64Class, ALUClass:
fmt.Fprintf(f, "dst: %s ", ins.Dst)
if op.ALUOp() == Swap || op.Source() == ImmSource {
fmt.Fprintf(f, "imm: %d", ins.Constant)
} else {
fmt.Fprintf(f, "src: %s", ins.Src)
}
case JumpClass:
switch jop := op.JumpOp(); jop {
case Call:
if ins.Src == PseudoCall {
// bpf-to-bpf call
fmt.Fprint(f, ins.Constant)
} else {
fmt.Fprint(f, BuiltinFunc(ins.Constant))
}
default:
fmt.Fprintf(f, "dst: %s off: %d ", ins.Dst, ins.Offset)
if op.Source() == ImmSource {
fmt.Fprintf(f, "imm: %d", ins.Constant)
} else {
fmt.Fprintf(f, "src: %s", ins.Src)
}
}
}
ref:
if ins.Reference != "" {
fmt.Fprintf(f, " <%s>", ins.Reference)
}
}
// Instructions is an eBPF program.
type Instructions []Instruction
func (insns Instructions) String() string {
return fmt.Sprint(insns)
}
// RewriteMapPtr rewrites all loads of a specific map pointer to a new fd.
//
// Returns an error if the symbol isn't used, see IsUnreferencedSymbol.
func (insns Instructions) RewriteMapPtr(symbol string, fd int) error {
if symbol == "" {
return errors.New("empty symbol")
}
found := false
for i := range insns {
ins := &insns[i]
if ins.Reference != symbol {
continue
}
if err := ins.RewriteMapPtr(fd); err != nil {
return err
}
found = true
}
if !found {
return &unreferencedSymbolError{symbol}
}
return nil
}
// SymbolOffsets returns the set of symbols and their offset in
// the instructions.
func (insns Instructions) SymbolOffsets() (map[string]int, error) {
offsets := make(map[string]int)
for i, ins := range insns {
if ins.Symbol == "" {
continue
}
if _, ok := offsets[ins.Symbol]; ok {
return nil, fmt.Errorf("duplicate symbol %s", ins.Symbol)
}
offsets[ins.Symbol] = i
}
return offsets, nil
}
// ReferenceOffsets returns the set of references and their offset in
// the instructions.
func (insns Instructions) ReferenceOffsets() map[string][]int {
offsets := make(map[string][]int)
for i, ins := range insns {
if ins.Reference == "" {
continue
}
offsets[ins.Reference] = append(offsets[ins.Reference], i)
}
return offsets
}
// Format implements fmt.Formatter.
//
// You can control indentation of symbols by
// specifying a width. Setting a precision controls the indentation of
// instructions.
// The default character is a tab, which can be overridden by specifying
// the ' ' space flag.
func (insns Instructions) Format(f fmt.State, c rune) {
if c != 's' && c != 'v' {
fmt.Fprintf(f, "{UNKNOWN FORMAT '%c'}", c)
return
}
// Precision is better in this case, because it allows
// specifying 0 padding easily.
padding, ok := f.Precision()
if !ok {
padding = 1
}
indent := strings.Repeat("\t", padding)
if f.Flag(' ') {
indent = strings.Repeat(" ", padding)
}
symPadding, ok := f.Width()
if !ok {
symPadding = padding - 1
}
if symPadding < 0 {
symPadding = 0
}
symIndent := strings.Repeat("\t", symPadding)
if f.Flag(' ') {
symIndent = strings.Repeat(" ", symPadding)
}
// Guess how many digits we need at most, by assuming that all instructions
// are double wide.
highestOffset := len(insns) * 2
offsetWidth := int(math.Ceil(math.Log10(float64(highestOffset))))
iter := insns.Iterate()
for iter.Next() {
if iter.Ins.Symbol != "" {
fmt.Fprintf(f, "%s%s:\n", symIndent, iter.Ins.Symbol)
}
fmt.Fprintf(f, "%s%*d: %v\n", indent, offsetWidth, iter.Offset, iter.Ins)
}
}
// Marshal encodes a BPF program into the kernel format.
func (insns Instructions) Marshal(w io.Writer, bo binary.ByteOrder) error {
for i, ins := range insns {
_, err := ins.Marshal(w, bo)
if err != nil {
return fmt.Errorf("instruction %d: %w", i, err)
}
}
return nil
}
// Tag calculates the kernel tag for a series of instructions.
//
// It mirrors bpf_prog_calc_tag in the kernel and so can be compared
// to ProgramInfo.Tag to figure out whether a loaded program matches
// certain instructions.
func (insns Instructions) Tag(bo binary.ByteOrder) (string, error) {
h := sha1.New()
for i, ins := range insns {
if ins.IsLoadFromMap() {
ins.Constant = 0
}
_, err := ins.Marshal(h, bo)
if err != nil {
return "", fmt.Errorf("instruction %d: %w", i, err)
}
}
return hex.EncodeToString(h.Sum(nil)[:unix.BPF_TAG_SIZE]), nil
}
// Iterate allows iterating a BPF program while keeping track of
// various offsets.
//
// Modifying the instruction slice will lead to undefined behaviour.
func (insns Instructions) Iterate() *InstructionIterator {
return &InstructionIterator{insns: insns}
}
// InstructionIterator iterates over a BPF program.
type InstructionIterator struct {
insns Instructions
// The instruction in question.
Ins *Instruction
// The index of the instruction in the original instruction slice.
Index int
// The offset of the instruction in raw BPF instructions. This accounts
// for double-wide instructions.
Offset RawInstructionOffset
}
// Next returns true as long as there are any instructions remaining.
func (iter *InstructionIterator) Next() bool {
if len(iter.insns) == 0 {
return false
}
if iter.Ins != nil {
iter.Index++
iter.Offset += RawInstructionOffset(iter.Ins.OpCode.rawInstructions())
}
iter.Ins = &iter.insns[0]
iter.insns = iter.insns[1:]
return true
}
type bpfInstruction struct {
OpCode OpCode
Registers bpfRegisters
Offset int16
Constant int32
}
type bpfRegisters uint8
func newBPFRegisters(dst, src Register, bo binary.ByteOrder) (bpfRegisters, error) {
switch bo {
case binary.LittleEndian:
return bpfRegisters((src << 4) | (dst & 0xF)), nil
case binary.BigEndian:
return bpfRegisters((dst << 4) | (src & 0xF)), nil
default:
return 0, fmt.Errorf("unrecognized ByteOrder %T", bo)
}
}
func (r bpfRegisters) Unmarshal(bo binary.ByteOrder) (dst, src Register, err error) {
switch bo {
case binary.LittleEndian:
return Register(r & 0xF), Register(r >> 4), nil
case binary.BigEndian:
return Register(r >> 4), Register(r & 0xf), nil
default:
return 0, 0, fmt.Errorf("unrecognized ByteOrder %T", bo)
}
}
type unreferencedSymbolError struct {
symbol string
}
func (use *unreferencedSymbolError) Error() string {
return fmt.Sprintf("unreferenced symbol %s", use.symbol)
}
// IsUnreferencedSymbol returns true if err was caused by
// an unreferenced symbol.
func IsUnreferencedSymbol(err error) bool {
_, ok := err.(*unreferencedSymbolError)
return ok
}

109
src/runtime/vendor/github.com/cilium/ebpf/asm/jump.go generated vendored Normal file
View File

@ -0,0 +1,109 @@
package asm
//go:generate stringer -output jump_string.go -type=JumpOp
// JumpOp affect control flow.
//
// msb lsb
// +----+-+---+
// |OP |s|cls|
// +----+-+---+
type JumpOp uint8
const jumpMask OpCode = aluMask
const (
// InvalidJumpOp is returned by getters when invoked
// on non branch OpCodes
InvalidJumpOp JumpOp = 0xff
// Ja jumps by offset unconditionally
Ja JumpOp = 0x00
// JEq jumps by offset if r == imm
JEq JumpOp = 0x10
// JGT jumps by offset if r > imm
JGT JumpOp = 0x20
// JGE jumps by offset if r >= imm
JGE JumpOp = 0x30
// JSet jumps by offset if r & imm
JSet JumpOp = 0x40
// JNE jumps by offset if r != imm
JNE JumpOp = 0x50
// JSGT jumps by offset if signed r > signed imm
JSGT JumpOp = 0x60
// JSGE jumps by offset if signed r >= signed imm
JSGE JumpOp = 0x70
// Call builtin or user defined function from imm
Call JumpOp = 0x80
// Exit ends execution, with value in r0
Exit JumpOp = 0x90
// JLT jumps by offset if r < imm
JLT JumpOp = 0xa0
// JLE jumps by offset if r <= imm
JLE JumpOp = 0xb0
// JSLT jumps by offset if signed r < signed imm
JSLT JumpOp = 0xc0
// JSLE jumps by offset if signed r <= signed imm
JSLE JumpOp = 0xd0
)
// Return emits an exit instruction.
//
// Requires a return value in R0.
func Return() Instruction {
return Instruction{
OpCode: OpCode(JumpClass).SetJumpOp(Exit),
}
}
// Op returns the OpCode for a given jump source.
func (op JumpOp) Op(source Source) OpCode {
return OpCode(JumpClass).SetJumpOp(op).SetSource(source)
}
// Imm compares dst to value, and adjusts PC by offset if the condition is fulfilled.
func (op JumpOp) Imm(dst Register, value int32, label string) Instruction {
if op == Exit || op == Call || op == Ja {
return Instruction{OpCode: InvalidOpCode}
}
return Instruction{
OpCode: OpCode(JumpClass).SetJumpOp(op).SetSource(ImmSource),
Dst: dst,
Offset: -1,
Constant: int64(value),
Reference: label,
}
}
// Reg compares dst to src, and adjusts PC by offset if the condition is fulfilled.
func (op JumpOp) Reg(dst, src Register, label string) Instruction {
if op == Exit || op == Call || op == Ja {
return Instruction{OpCode: InvalidOpCode}
}
return Instruction{
OpCode: OpCode(JumpClass).SetJumpOp(op).SetSource(RegSource),
Dst: dst,
Src: src,
Offset: -1,
Reference: label,
}
}
// Label adjusts PC to the address of the label.
func (op JumpOp) Label(label string) Instruction {
if op == Call {
return Instruction{
OpCode: OpCode(JumpClass).SetJumpOp(Call),
Src: PseudoCall,
Constant: -1,
Reference: label,
}
}
return Instruction{
OpCode: OpCode(JumpClass).SetJumpOp(op),
Offset: -1,
Reference: label,
}
}

View File

@ -0,0 +1,53 @@
// Code generated by "stringer -output jump_string.go -type=JumpOp"; DO NOT EDIT.
package asm
import "strconv"
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[InvalidJumpOp-255]
_ = x[Ja-0]
_ = x[JEq-16]
_ = x[JGT-32]
_ = x[JGE-48]
_ = x[JSet-64]
_ = x[JNE-80]
_ = x[JSGT-96]
_ = x[JSGE-112]
_ = x[Call-128]
_ = x[Exit-144]
_ = x[JLT-160]
_ = x[JLE-176]
_ = x[JSLT-192]
_ = x[JSLE-208]
}
const _JumpOp_name = "JaJEqJGTJGEJSetJNEJSGTJSGECallExitJLTJLEJSLTJSLEInvalidJumpOp"
var _JumpOp_map = map[JumpOp]string{
0: _JumpOp_name[0:2],
16: _JumpOp_name[2:5],
32: _JumpOp_name[5:8],
48: _JumpOp_name[8:11],
64: _JumpOp_name[11:15],
80: _JumpOp_name[15:18],
96: _JumpOp_name[18:22],
112: _JumpOp_name[22:26],
128: _JumpOp_name[26:30],
144: _JumpOp_name[30:34],
160: _JumpOp_name[34:37],
176: _JumpOp_name[37:40],
192: _JumpOp_name[40:44],
208: _JumpOp_name[44:48],
255: _JumpOp_name[48:61],
}
func (i JumpOp) String() string {
if str, ok := _JumpOp_map[i]; ok {
return str
}
return "JumpOp(" + strconv.FormatInt(int64(i), 10) + ")"
}

View File

@ -0,0 +1,204 @@
package asm
//go:generate stringer -output load_store_string.go -type=Mode,Size
// Mode for load and store operations
//
// msb lsb
// +---+--+---+
// |MDE|sz|cls|
// +---+--+---+
type Mode uint8
const modeMask OpCode = 0xe0
const (
// InvalidMode is returned by getters when invoked
// on non load / store OpCodes
InvalidMode Mode = 0xff
// ImmMode - immediate value
ImmMode Mode = 0x00
// AbsMode - immediate value + offset
AbsMode Mode = 0x20
// IndMode - indirect (imm+src)
IndMode Mode = 0x40
// MemMode - load from memory
MemMode Mode = 0x60
// XAddMode - add atomically across processors.
XAddMode Mode = 0xc0
)
// Size of load and store operations
//
// msb lsb
// +---+--+---+
// |mde|SZ|cls|
// +---+--+---+
type Size uint8
const sizeMask OpCode = 0x18
const (
// InvalidSize is returned by getters when invoked
// on non load / store OpCodes
InvalidSize Size = 0xff
// DWord - double word; 64 bits
DWord Size = 0x18
// Word - word; 32 bits
Word Size = 0x00
// Half - half-word; 16 bits
Half Size = 0x08
// Byte - byte; 8 bits
Byte Size = 0x10
)
// Sizeof returns the size in bytes.
func (s Size) Sizeof() int {
switch s {
case DWord:
return 8
case Word:
return 4
case Half:
return 2
case Byte:
return 1
default:
return -1
}
}
// LoadMemOp returns the OpCode to load a value of given size from memory.
func LoadMemOp(size Size) OpCode {
return OpCode(LdXClass).SetMode(MemMode).SetSize(size)
}
// LoadMem emits `dst = *(size *)(src + offset)`.
func LoadMem(dst, src Register, offset int16, size Size) Instruction {
return Instruction{
OpCode: LoadMemOp(size),
Dst: dst,
Src: src,
Offset: offset,
}
}
// LoadImmOp returns the OpCode to load an immediate of given size.
//
// As of kernel 4.20, only DWord size is accepted.
func LoadImmOp(size Size) OpCode {
return OpCode(LdClass).SetMode(ImmMode).SetSize(size)
}
// LoadImm emits `dst = (size)value`.
//
// As of kernel 4.20, only DWord size is accepted.
func LoadImm(dst Register, value int64, size Size) Instruction {
return Instruction{
OpCode: LoadImmOp(size),
Dst: dst,
Constant: value,
}
}
// LoadMapPtr stores a pointer to a map in dst.
func LoadMapPtr(dst Register, fd int) Instruction {
if fd < 0 {
return Instruction{OpCode: InvalidOpCode}
}
return Instruction{
OpCode: LoadImmOp(DWord),
Dst: dst,
Src: PseudoMapFD,
Constant: int64(uint32(fd)),
}
}
// LoadMapValue stores a pointer to the value at a certain offset of a map.
func LoadMapValue(dst Register, fd int, offset uint32) Instruction {
if fd < 0 {
return Instruction{OpCode: InvalidOpCode}
}
fdAndOffset := (uint64(offset) << 32) | uint64(uint32(fd))
return Instruction{
OpCode: LoadImmOp(DWord),
Dst: dst,
Src: PseudoMapValue,
Constant: int64(fdAndOffset),
}
}
// LoadIndOp returns the OpCode for loading a value of given size from an sk_buff.
func LoadIndOp(size Size) OpCode {
return OpCode(LdClass).SetMode(IndMode).SetSize(size)
}
// LoadInd emits `dst = ntoh(*(size *)(((sk_buff *)R6)->data + src + offset))`.
func LoadInd(dst, src Register, offset int32, size Size) Instruction {
return Instruction{
OpCode: LoadIndOp(size),
Dst: dst,
Src: src,
Constant: int64(offset),
}
}
// LoadAbsOp returns the OpCode for loading a value of given size from an sk_buff.
func LoadAbsOp(size Size) OpCode {
return OpCode(LdClass).SetMode(AbsMode).SetSize(size)
}
// LoadAbs emits `r0 = ntoh(*(size *)(((sk_buff *)R6)->data + offset))`.
func LoadAbs(offset int32, size Size) Instruction {
return Instruction{
OpCode: LoadAbsOp(size),
Dst: R0,
Constant: int64(offset),
}
}
// StoreMemOp returns the OpCode for storing a register of given size in memory.
func StoreMemOp(size Size) OpCode {
return OpCode(StXClass).SetMode(MemMode).SetSize(size)
}
// StoreMem emits `*(size *)(dst + offset) = src`
func StoreMem(dst Register, offset int16, src Register, size Size) Instruction {
return Instruction{
OpCode: StoreMemOp(size),
Dst: dst,
Src: src,
Offset: offset,
}
}
// StoreImmOp returns the OpCode for storing an immediate of given size in memory.
func StoreImmOp(size Size) OpCode {
return OpCode(StClass).SetMode(MemMode).SetSize(size)
}
// StoreImm emits `*(size *)(dst + offset) = value`.
func StoreImm(dst Register, offset int16, value int64, size Size) Instruction {
return Instruction{
OpCode: StoreImmOp(size),
Dst: dst,
Offset: offset,
Constant: value,
}
}
// StoreXAddOp returns the OpCode to atomically add a register to a value in memory.
func StoreXAddOp(size Size) OpCode {
return OpCode(StXClass).SetMode(XAddMode).SetSize(size)
}
// StoreXAdd atomically adds src to *dst.
func StoreXAdd(dst, src Register, size Size) Instruction {
return Instruction{
OpCode: StoreXAddOp(size),
Dst: dst,
Src: src,
}
}

View File

@ -0,0 +1,80 @@
// Code generated by "stringer -output load_store_string.go -type=Mode,Size"; DO NOT EDIT.
package asm
import "strconv"
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[InvalidMode-255]
_ = x[ImmMode-0]
_ = x[AbsMode-32]
_ = x[IndMode-64]
_ = x[MemMode-96]
_ = x[XAddMode-192]
}
const (
_Mode_name_0 = "ImmMode"
_Mode_name_1 = "AbsMode"
_Mode_name_2 = "IndMode"
_Mode_name_3 = "MemMode"
_Mode_name_4 = "XAddMode"
_Mode_name_5 = "InvalidMode"
)
func (i Mode) String() string {
switch {
case i == 0:
return _Mode_name_0
case i == 32:
return _Mode_name_1
case i == 64:
return _Mode_name_2
case i == 96:
return _Mode_name_3
case i == 192:
return _Mode_name_4
case i == 255:
return _Mode_name_5
default:
return "Mode(" + strconv.FormatInt(int64(i), 10) + ")"
}
}
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[InvalidSize-255]
_ = x[DWord-24]
_ = x[Word-0]
_ = x[Half-8]
_ = x[Byte-16]
}
const (
_Size_name_0 = "Word"
_Size_name_1 = "Half"
_Size_name_2 = "Byte"
_Size_name_3 = "DWord"
_Size_name_4 = "InvalidSize"
)
func (i Size) String() string {
switch {
case i == 0:
return _Size_name_0
case i == 8:
return _Size_name_1
case i == 16:
return _Size_name_2
case i == 24:
return _Size_name_3
case i == 255:
return _Size_name_4
default:
return "Size(" + strconv.FormatInt(int64(i), 10) + ")"
}
}

237
src/runtime/vendor/github.com/cilium/ebpf/asm/opcode.go generated vendored Normal file
View File

@ -0,0 +1,237 @@
package asm
import (
"fmt"
"strings"
)
//go:generate stringer -output opcode_string.go -type=Class
type encoding int
const (
unknownEncoding encoding = iota
loadOrStore
jumpOrALU
)
// Class of operations
//
// msb lsb
// +---+--+---+
// | ?? |CLS|
// +---+--+---+
type Class uint8
const classMask OpCode = 0x07
const (
// LdClass load memory
LdClass Class = 0x00
// LdXClass load memory from constant
LdXClass Class = 0x01
// StClass load register from memory
StClass Class = 0x02
// StXClass load register from constant
StXClass Class = 0x03
// ALUClass arithmetic operators
ALUClass Class = 0x04
// JumpClass jump operators
JumpClass Class = 0x05
// ALU64Class arithmetic in 64 bit mode
ALU64Class Class = 0x07
)
func (cls Class) encoding() encoding {
switch cls {
case LdClass, LdXClass, StClass, StXClass:
return loadOrStore
case ALU64Class, ALUClass, JumpClass:
return jumpOrALU
default:
return unknownEncoding
}
}
// OpCode is a packed eBPF opcode.
//
// Its encoding is defined by a Class value:
//
// msb lsb
// +----+-+---+
// | ???? |CLS|
// +----+-+---+
type OpCode uint8
// InvalidOpCode is returned by setters on OpCode
const InvalidOpCode OpCode = 0xff
// rawInstructions returns the number of BPF instructions required
// to encode this opcode.
func (op OpCode) rawInstructions() int {
if op.IsDWordLoad() {
return 2
}
return 1
}
func (op OpCode) IsDWordLoad() bool {
return op == LoadImmOp(DWord)
}
// Class returns the class of operation.
func (op OpCode) Class() Class {
return Class(op & classMask)
}
// Mode returns the mode for load and store operations.
func (op OpCode) Mode() Mode {
if op.Class().encoding() != loadOrStore {
return InvalidMode
}
return Mode(op & modeMask)
}
// Size returns the size for load and store operations.
func (op OpCode) Size() Size {
if op.Class().encoding() != loadOrStore {
return InvalidSize
}
return Size(op & sizeMask)
}
// Source returns the source for branch and ALU operations.
func (op OpCode) Source() Source {
if op.Class().encoding() != jumpOrALU || op.ALUOp() == Swap {
return InvalidSource
}
return Source(op & sourceMask)
}
// ALUOp returns the ALUOp.
func (op OpCode) ALUOp() ALUOp {
if op.Class().encoding() != jumpOrALU {
return InvalidALUOp
}
return ALUOp(op & aluMask)
}
// Endianness returns the Endianness for a byte swap instruction.
func (op OpCode) Endianness() Endianness {
if op.ALUOp() != Swap {
return InvalidEndian
}
return Endianness(op & endianMask)
}
// JumpOp returns the JumpOp.
func (op OpCode) JumpOp() JumpOp {
if op.Class().encoding() != jumpOrALU {
return InvalidJumpOp
}
return JumpOp(op & jumpMask)
}
// SetMode sets the mode on load and store operations.
//
// Returns InvalidOpCode if op is of the wrong class.
func (op OpCode) SetMode(mode Mode) OpCode {
if op.Class().encoding() != loadOrStore || !valid(OpCode(mode), modeMask) {
return InvalidOpCode
}
return (op & ^modeMask) | OpCode(mode)
}
// SetSize sets the size on load and store operations.
//
// Returns InvalidOpCode if op is of the wrong class.
func (op OpCode) SetSize(size Size) OpCode {
if op.Class().encoding() != loadOrStore || !valid(OpCode(size), sizeMask) {
return InvalidOpCode
}
return (op & ^sizeMask) | OpCode(size)
}
// SetSource sets the source on jump and ALU operations.
//
// Returns InvalidOpCode if op is of the wrong class.
func (op OpCode) SetSource(source Source) OpCode {
if op.Class().encoding() != jumpOrALU || !valid(OpCode(source), sourceMask) {
return InvalidOpCode
}
return (op & ^sourceMask) | OpCode(source)
}
// SetALUOp sets the ALUOp on ALU operations.
//
// Returns InvalidOpCode if op is of the wrong class.
func (op OpCode) SetALUOp(alu ALUOp) OpCode {
class := op.Class()
if (class != ALUClass && class != ALU64Class) || !valid(OpCode(alu), aluMask) {
return InvalidOpCode
}
return (op & ^aluMask) | OpCode(alu)
}
// SetJumpOp sets the JumpOp on jump operations.
//
// Returns InvalidOpCode if op is of the wrong class.
func (op OpCode) SetJumpOp(jump JumpOp) OpCode {
if op.Class() != JumpClass || !valid(OpCode(jump), jumpMask) {
return InvalidOpCode
}
return (op & ^jumpMask) | OpCode(jump)
}
func (op OpCode) String() string {
var f strings.Builder
switch class := op.Class(); class {
case LdClass, LdXClass, StClass, StXClass:
f.WriteString(strings.TrimSuffix(class.String(), "Class"))
mode := op.Mode()
f.WriteString(strings.TrimSuffix(mode.String(), "Mode"))
switch op.Size() {
case DWord:
f.WriteString("DW")
case Word:
f.WriteString("W")
case Half:
f.WriteString("H")
case Byte:
f.WriteString("B")
}
case ALU64Class, ALUClass:
f.WriteString(op.ALUOp().String())
if op.ALUOp() == Swap {
// Width for Endian is controlled by Constant
f.WriteString(op.Endianness().String())
} else {
if class == ALUClass {
f.WriteString("32")
}
f.WriteString(strings.TrimSuffix(op.Source().String(), "Source"))
}
case JumpClass:
f.WriteString(op.JumpOp().String())
if jop := op.JumpOp(); jop != Exit && jop != Call {
f.WriteString(strings.TrimSuffix(op.Source().String(), "Source"))
}
default:
fmt.Fprintf(&f, "OpCode(%#x)", uint8(op))
}
return f.String()
}
// valid returns true if all bits in value are covered by mask.
func valid(value, mask OpCode) bool {
return value & ^mask == 0
}

View File

@ -0,0 +1,38 @@
// Code generated by "stringer -output opcode_string.go -type=Class"; DO NOT EDIT.
package asm
import "strconv"
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[LdClass-0]
_ = x[LdXClass-1]
_ = x[StClass-2]
_ = x[StXClass-3]
_ = x[ALUClass-4]
_ = x[JumpClass-5]
_ = x[ALU64Class-7]
}
const (
_Class_name_0 = "LdClassLdXClassStClassStXClassALUClassJumpClass"
_Class_name_1 = "ALU64Class"
)
var (
_Class_index_0 = [...]uint8{0, 7, 15, 22, 30, 38, 47}
)
func (i Class) String() string {
switch {
case 0 <= i && i <= 5:
return _Class_name_0[_Class_index_0[i]:_Class_index_0[i+1]]
case i == 7:
return _Class_name_1
default:
return "Class(" + strconv.FormatInt(int64(i), 10) + ")"
}
}

View File

@ -0,0 +1,49 @@
package asm
import (
"fmt"
)
// Register is the source or destination of most operations.
type Register uint8
// R0 contains return values.
const R0 Register = 0
// Registers for function arguments.
const (
R1 Register = R0 + 1 + iota
R2
R3
R4
R5
)
// Callee saved registers preserved by function calls.
const (
R6 Register = R5 + 1 + iota
R7
R8
R9
)
// Read-only frame pointer to access stack.
const (
R10 Register = R9 + 1
RFP = R10
)
// Pseudo registers used by 64bit loads and jumps
const (
PseudoMapFD = R1 // BPF_PSEUDO_MAP_FD
PseudoMapValue = R2 // BPF_PSEUDO_MAP_VALUE
PseudoCall = R1 // BPF_PSEUDO_CALL
)
func (r Register) String() string {
v := uint8(r)
if v == 10 {
return "rfp"
}
return fmt.Sprintf("r%d", v)
}

616
src/runtime/vendor/github.com/cilium/ebpf/collection.go generated vendored Normal file
View File

@ -0,0 +1,616 @@
package ebpf
import (
"errors"
"fmt"
"io"
"math"
"reflect"
"strings"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/internal"
"github.com/cilium/ebpf/internal/btf"
)
// CollectionOptions control loading a collection into the kernel.
//
// Maps and Programs are passed to NewMapWithOptions and NewProgramsWithOptions.
type CollectionOptions struct {
Maps MapOptions
Programs ProgramOptions
}
// CollectionSpec describes a collection.
type CollectionSpec struct {
Maps map[string]*MapSpec
Programs map[string]*ProgramSpec
}
// Copy returns a recursive copy of the spec.
func (cs *CollectionSpec) Copy() *CollectionSpec {
if cs == nil {
return nil
}
cpy := CollectionSpec{
Maps: make(map[string]*MapSpec, len(cs.Maps)),
Programs: make(map[string]*ProgramSpec, len(cs.Programs)),
}
for name, spec := range cs.Maps {
cpy.Maps[name] = spec.Copy()
}
for name, spec := range cs.Programs {
cpy.Programs[name] = spec.Copy()
}
return &cpy
}
// RewriteMaps replaces all references to specific maps.
//
// Use this function to use pre-existing maps instead of creating new ones
// when calling NewCollection. Any named maps are removed from CollectionSpec.Maps.
//
// Returns an error if a named map isn't used in at least one program.
func (cs *CollectionSpec) RewriteMaps(maps map[string]*Map) error {
for symbol, m := range maps {
// have we seen a program that uses this symbol / map
seen := false
fd := m.FD()
for progName, progSpec := range cs.Programs {
err := progSpec.Instructions.RewriteMapPtr(symbol, fd)
switch {
case err == nil:
seen = true
case asm.IsUnreferencedSymbol(err):
// Not all programs need to use the map
default:
return fmt.Errorf("program %s: %w", progName, err)
}
}
if !seen {
return fmt.Errorf("map %s not referenced by any programs", symbol)
}
// Prevent NewCollection from creating rewritten maps
delete(cs.Maps, symbol)
}
return nil
}
// RewriteConstants replaces the value of multiple constants.
//
// The constant must be defined like so in the C program:
//
// volatile const type foobar;
// volatile const type foobar = default;
//
// Replacement values must be of the same length as the C sizeof(type).
// If necessary, they are marshalled according to the same rules as
// map values.
//
// From Linux 5.5 the verifier will use constants to eliminate dead code.
//
// Returns an error if a constant doesn't exist.
func (cs *CollectionSpec) RewriteConstants(consts map[string]interface{}) error {
rodata := cs.Maps[".rodata"]
if rodata == nil {
return errors.New("missing .rodata section")
}
if rodata.BTF == nil {
return errors.New(".rodata section has no BTF")
}
if n := len(rodata.Contents); n != 1 {
return fmt.Errorf("expected one key in .rodata, found %d", n)
}
kv := rodata.Contents[0]
value, ok := kv.Value.([]byte)
if !ok {
return fmt.Errorf("first value in .rodata is %T not []byte", kv.Value)
}
buf := make([]byte, len(value))
copy(buf, value)
err := patchValue(buf, btf.MapValue(rodata.BTF), consts)
if err != nil {
return err
}
rodata.Contents[0] = MapKV{kv.Key, buf}
return nil
}
// Assign the contents of a CollectionSpec to a struct.
//
// This function is a short-cut to manually checking the presence
// of maps and programs in a collection spec. Consider using bpf2go if this
// sounds useful.
//
// The argument to must be a pointer to a struct. A field of the
// struct is updated with values from Programs or Maps if it
// has an `ebpf` tag and its type is *ProgramSpec or *MapSpec.
// The tag gives the name of the program or map as found in
// the CollectionSpec.
//
// struct {
// Foo *ebpf.ProgramSpec `ebpf:"xdp_foo"`
// Bar *ebpf.MapSpec `ebpf:"bar_map"`
// Ignored int
// }
//
// Returns an error if any of the fields can't be found, or
// if the same map or program is assigned multiple times.
func (cs *CollectionSpec) Assign(to interface{}) error {
valueOf := func(typ reflect.Type, name string) (reflect.Value, error) {
switch typ {
case reflect.TypeOf((*ProgramSpec)(nil)):
p := cs.Programs[name]
if p == nil {
return reflect.Value{}, fmt.Errorf("missing program %q", name)
}
return reflect.ValueOf(p), nil
case reflect.TypeOf((*MapSpec)(nil)):
m := cs.Maps[name]
if m == nil {
return reflect.Value{}, fmt.Errorf("missing map %q", name)
}
return reflect.ValueOf(m), nil
default:
return reflect.Value{}, fmt.Errorf("unsupported type %s", typ)
}
}
return assignValues(to, valueOf)
}
// LoadAndAssign maps and programs into the kernel and assign them to a struct.
//
// This function is a short-cut to manually checking the presence
// of maps and programs in a collection spec. Consider using bpf2go if this
// sounds useful.
//
// The argument to must be a pointer to a struct. A field of the
// struct is updated with values from Programs or Maps if it
// has an `ebpf` tag and its type is *Program or *Map.
// The tag gives the name of the program or map as found in
// the CollectionSpec.
//
// struct {
// Foo *ebpf.Program `ebpf:"xdp_foo"`
// Bar *ebpf.Map `ebpf:"bar_map"`
// Ignored int
// }
//
// opts may be nil.
//
// Returns an error if any of the fields can't be found, or
// if the same map or program is assigned multiple times.
func (cs *CollectionSpec) LoadAndAssign(to interface{}, opts *CollectionOptions) error {
if opts == nil {
opts = &CollectionOptions{}
}
loadMap, loadProgram, done, cleanup := lazyLoadCollection(cs, opts)
defer cleanup()
valueOf := func(typ reflect.Type, name string) (reflect.Value, error) {
switch typ {
case reflect.TypeOf((*Program)(nil)):
p, err := loadProgram(name)
if err != nil {
return reflect.Value{}, err
}
return reflect.ValueOf(p), nil
case reflect.TypeOf((*Map)(nil)):
m, err := loadMap(name)
if err != nil {
return reflect.Value{}, err
}
return reflect.ValueOf(m), nil
default:
return reflect.Value{}, fmt.Errorf("unsupported type %s", typ)
}
}
if err := assignValues(to, valueOf); err != nil {
return err
}
done()
return nil
}
// Collection is a collection of Programs and Maps associated
// with their symbols
type Collection struct {
Programs map[string]*Program
Maps map[string]*Map
}
// NewCollection creates a Collection from a specification.
func NewCollection(spec *CollectionSpec) (*Collection, error) {
return NewCollectionWithOptions(spec, CollectionOptions{})
}
// NewCollectionWithOptions creates a Collection from a specification.
func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (*Collection, error) {
loadMap, loadProgram, done, cleanup := lazyLoadCollection(spec, &opts)
defer cleanup()
for mapName := range spec.Maps {
_, err := loadMap(mapName)
if err != nil {
return nil, err
}
}
for progName := range spec.Programs {
_, err := loadProgram(progName)
if err != nil {
return nil, err
}
}
maps, progs := done()
return &Collection{
progs,
maps,
}, nil
}
type handleCache struct {
btfHandles map[*btf.Spec]*btf.Handle
btfSpecs map[io.ReaderAt]*btf.Spec
}
func newHandleCache() *handleCache {
return &handleCache{
btfHandles: make(map[*btf.Spec]*btf.Handle),
btfSpecs: make(map[io.ReaderAt]*btf.Spec),
}
}
func (hc handleCache) btfHandle(spec *btf.Spec) (*btf.Handle, error) {
if hc.btfHandles[spec] != nil {
return hc.btfHandles[spec], nil
}
handle, err := btf.NewHandle(spec)
if err != nil {
return nil, err
}
hc.btfHandles[spec] = handle
return handle, nil
}
func (hc handleCache) btfSpec(rd io.ReaderAt) (*btf.Spec, error) {
if hc.btfSpecs[rd] != nil {
return hc.btfSpecs[rd], nil
}
spec, err := btf.LoadSpecFromReader(rd)
if err != nil {
return nil, err
}
hc.btfSpecs[rd] = spec
return spec, nil
}
func (hc handleCache) close() {
for _, handle := range hc.btfHandles {
handle.Close()
}
hc.btfHandles = nil
hc.btfSpecs = nil
}
func lazyLoadCollection(coll *CollectionSpec, opts *CollectionOptions) (
loadMap func(string) (*Map, error),
loadProgram func(string) (*Program, error),
done func() (map[string]*Map, map[string]*Program),
cleanup func(),
) {
var (
maps = make(map[string]*Map)
progs = make(map[string]*Program)
handles = newHandleCache()
skipMapsAndProgs = false
)
cleanup = func() {
handles.close()
if skipMapsAndProgs {
return
}
for _, m := range maps {
m.Close()
}
for _, p := range progs {
p.Close()
}
}
done = func() (map[string]*Map, map[string]*Program) {
skipMapsAndProgs = true
return maps, progs
}
loadMap = func(mapName string) (*Map, error) {
if m := maps[mapName]; m != nil {
return m, nil
}
mapSpec := coll.Maps[mapName]
if mapSpec == nil {
return nil, fmt.Errorf("missing map %s", mapName)
}
m, err := newMapWithOptions(mapSpec, opts.Maps, handles)
if err != nil {
return nil, fmt.Errorf("map %s: %w", mapName, err)
}
maps[mapName] = m
return m, nil
}
loadProgram = func(progName string) (*Program, error) {
if prog := progs[progName]; prog != nil {
return prog, nil
}
progSpec := coll.Programs[progName]
if progSpec == nil {
return nil, fmt.Errorf("unknown program %s", progName)
}
progSpec = progSpec.Copy()
// Rewrite any reference to a valid map.
for i := range progSpec.Instructions {
ins := &progSpec.Instructions[i]
if !ins.IsLoadFromMap() || ins.Reference == "" {
continue
}
if uint32(ins.Constant) != math.MaxUint32 {
// Don't overwrite maps already rewritten, users can
// rewrite programs in the spec themselves
continue
}
m, err := loadMap(ins.Reference)
if err != nil {
return nil, fmt.Errorf("program %s: %w", progName, err)
}
fd := m.FD()
if fd < 0 {
return nil, fmt.Errorf("map %s: %w", ins.Reference, internal.ErrClosedFd)
}
if err := ins.RewriteMapPtr(m.FD()); err != nil {
return nil, fmt.Errorf("progam %s: map %s: %w", progName, ins.Reference, err)
}
}
prog, err := newProgramWithOptions(progSpec, opts.Programs, handles)
if err != nil {
return nil, fmt.Errorf("program %s: %w", progName, err)
}
progs[progName] = prog
return prog, nil
}
return
}
// LoadCollection parses an object file and converts it to a collection.
func LoadCollection(file string) (*Collection, error) {
spec, err := LoadCollectionSpec(file)
if err != nil {
return nil, err
}
return NewCollection(spec)
}
// Close frees all maps and programs associated with the collection.
//
// The collection mustn't be used afterwards.
func (coll *Collection) Close() {
for _, prog := range coll.Programs {
prog.Close()
}
for _, m := range coll.Maps {
m.Close()
}
}
// DetachMap removes the named map from the Collection.
//
// This means that a later call to Close() will not affect this map.
//
// Returns nil if no map of that name exists.
func (coll *Collection) DetachMap(name string) *Map {
m := coll.Maps[name]
delete(coll.Maps, name)
return m
}
// DetachProgram removes the named program from the Collection.
//
// This means that a later call to Close() will not affect this program.
//
// Returns nil if no program of that name exists.
func (coll *Collection) DetachProgram(name string) *Program {
p := coll.Programs[name]
delete(coll.Programs, name)
return p
}
// Assign the contents of a collection to a struct.
//
// Deprecated: use CollectionSpec.Assign instead. It provides the same
// functionality but creates only the maps and programs requested.
func (coll *Collection) Assign(to interface{}) error {
assignedMaps := make(map[string]struct{})
assignedPrograms := make(map[string]struct{})
valueOf := func(typ reflect.Type, name string) (reflect.Value, error) {
switch typ {
case reflect.TypeOf((*Program)(nil)):
p := coll.Programs[name]
if p == nil {
return reflect.Value{}, fmt.Errorf("missing program %q", name)
}
assignedPrograms[name] = struct{}{}
return reflect.ValueOf(p), nil
case reflect.TypeOf((*Map)(nil)):
m := coll.Maps[name]
if m == nil {
return reflect.Value{}, fmt.Errorf("missing map %q", name)
}
assignedMaps[name] = struct{}{}
return reflect.ValueOf(m), nil
default:
return reflect.Value{}, fmt.Errorf("unsupported type %s", typ)
}
}
if err := assignValues(to, valueOf); err != nil {
return err
}
for name := range assignedPrograms {
coll.DetachProgram(name)
}
for name := range assignedMaps {
coll.DetachMap(name)
}
return nil
}
func assignValues(to interface{}, valueOf func(reflect.Type, string) (reflect.Value, error)) error {
type structField struct {
reflect.StructField
value reflect.Value
}
var (
fields []structField
visitedTypes = make(map[reflect.Type]bool)
flattenStruct func(reflect.Value) error
)
flattenStruct = func(structVal reflect.Value) error {
structType := structVal.Type()
if structType.Kind() != reflect.Struct {
return fmt.Errorf("%s is not a struct", structType)
}
if visitedTypes[structType] {
return fmt.Errorf("recursion on type %s", structType)
}
for i := 0; i < structType.NumField(); i++ {
field := structField{structType.Field(i), structVal.Field(i)}
name := field.Tag.Get("ebpf")
if name != "" {
fields = append(fields, field)
continue
}
var err error
switch field.Type.Kind() {
case reflect.Ptr:
if field.Type.Elem().Kind() != reflect.Struct {
continue
}
if field.value.IsNil() {
return fmt.Errorf("nil pointer to %s", structType)
}
err = flattenStruct(field.value.Elem())
case reflect.Struct:
err = flattenStruct(field.value)
default:
continue
}
if err != nil {
return fmt.Errorf("field %s: %w", field.Name, err)
}
}
return nil
}
toValue := reflect.ValueOf(to)
if toValue.Type().Kind() != reflect.Ptr {
return fmt.Errorf("%T is not a pointer to struct", to)
}
if toValue.IsNil() {
return fmt.Errorf("nil pointer to %T", to)
}
if err := flattenStruct(toValue.Elem()); err != nil {
return err
}
type elem struct {
// Either *Map or *Program
typ reflect.Type
name string
}
assignedTo := make(map[elem]string)
for _, field := range fields {
name := field.Tag.Get("ebpf")
if strings.Contains(name, ",") {
return fmt.Errorf("field %s: ebpf tag contains a comma", field.Name)
}
e := elem{field.Type, name}
if assignedField := assignedTo[e]; assignedField != "" {
return fmt.Errorf("field %s: %q was already assigned to %s", field.Name, name, assignedField)
}
value, err := valueOf(field.Type, name)
if err != nil {
return fmt.Errorf("field %s: %w", field.Name, err)
}
if !field.value.CanSet() {
return fmt.Errorf("field %s: can't set value", field.Name)
}
field.value.Set(value)
assignedTo[e] = field.Name
}
return nil
}

16
src/runtime/vendor/github.com/cilium/ebpf/doc.go generated vendored Normal file
View File

@ -0,0 +1,16 @@
// Package ebpf is a toolkit for working with eBPF programs.
//
// eBPF programs are small snippets of code which are executed directly
// in a VM in the Linux kernel, which makes them very fast and flexible.
// Many Linux subsystems now accept eBPF programs. This makes it possible
// to implement highly application specific logic inside the kernel,
// without having to modify the actual kernel itself.
//
// This package is designed for long-running processes which
// want to use eBPF to implement part of their application logic. It has no
// run-time dependencies outside of the library and the Linux kernel itself.
// eBPF code should be compiled ahead of time using clang, and shipped with
// your application as any other resource.
//
// Use the link subpackage to attach a loaded program to a hook in the kernel.
package ebpf

953
src/runtime/vendor/github.com/cilium/ebpf/elf_reader.go generated vendored Normal file
View File

@ -0,0 +1,953 @@
package ebpf
import (
"bufio"
"bytes"
"debug/elf"
"encoding/binary"
"errors"
"fmt"
"io"
"math"
"os"
"strings"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/internal"
"github.com/cilium/ebpf/internal/btf"
"github.com/cilium/ebpf/internal/unix"
)
// elfCode is a convenience to reduce the amount of arguments that have to
// be passed around explicitly. You should treat it's contents as immutable.
type elfCode struct {
*internal.SafeELFFile
sections map[elf.SectionIndex]*elfSection
license string
version uint32
btf *btf.Spec
}
// LoadCollectionSpec parses an ELF file into a CollectionSpec.
func LoadCollectionSpec(file string) (*CollectionSpec, error) {
f, err := os.Open(file)
if err != nil {
return nil, err
}
defer f.Close()
spec, err := LoadCollectionSpecFromReader(f)
if err != nil {
return nil, fmt.Errorf("file %s: %w", file, err)
}
return spec, nil
}
// LoadCollectionSpecFromReader parses an ELF file into a CollectionSpec.
func LoadCollectionSpecFromReader(rd io.ReaderAt) (*CollectionSpec, error) {
f, err := internal.NewSafeELFFile(rd)
if err != nil {
return nil, err
}
defer f.Close()
var (
licenseSection *elf.Section
versionSection *elf.Section
sections = make(map[elf.SectionIndex]*elfSection)
relSections = make(map[elf.SectionIndex]*elf.Section)
)
// This is the target of relocations generated by inline assembly.
sections[elf.SHN_UNDEF] = newElfSection(new(elf.Section), undefSection)
// Collect all the sections we're interested in. This includes relocations
// which we parse later.
for i, sec := range f.Sections {
idx := elf.SectionIndex(i)
switch {
case strings.HasPrefix(sec.Name, "license"):
licenseSection = sec
case strings.HasPrefix(sec.Name, "version"):
versionSection = sec
case strings.HasPrefix(sec.Name, "maps"):
sections[idx] = newElfSection(sec, mapSection)
case sec.Name == ".maps":
sections[idx] = newElfSection(sec, btfMapSection)
case sec.Name == ".bss" || sec.Name == ".data" || strings.HasPrefix(sec.Name, ".rodata"):
sections[idx] = newElfSection(sec, dataSection)
case sec.Type == elf.SHT_REL:
// Store relocations under the section index of the target
relSections[elf.SectionIndex(sec.Info)] = sec
case sec.Type == elf.SHT_PROGBITS && (sec.Flags&elf.SHF_EXECINSTR) != 0 && sec.Size > 0:
sections[idx] = newElfSection(sec, programSection)
}
}
license, err := loadLicense(licenseSection)
if err != nil {
return nil, fmt.Errorf("load license: %w", err)
}
version, err := loadVersion(versionSection, f.ByteOrder)
if err != nil {
return nil, fmt.Errorf("load version: %w", err)
}
btfSpec, err := btf.LoadSpecFromReader(rd)
if err != nil && !errors.Is(err, btf.ErrNotFound) {
return nil, fmt.Errorf("load BTF: %w", err)
}
// Assign symbols to all the sections we're interested in.
symbols, err := f.Symbols()
if err != nil {
return nil, fmt.Errorf("load symbols: %v", err)
}
for _, symbol := range symbols {
idx := symbol.Section
symType := elf.ST_TYPE(symbol.Info)
section := sections[idx]
if section == nil {
continue
}
// Older versions of LLVM don't tag symbols correctly, so keep
// all NOTYPE ones.
keep := symType == elf.STT_NOTYPE
switch section.kind {
case mapSection, btfMapSection, dataSection:
keep = keep || symType == elf.STT_OBJECT
case programSection:
keep = keep || symType == elf.STT_FUNC
}
if !keep || symbol.Name == "" {
continue
}
section.symbols[symbol.Value] = symbol
}
ec := &elfCode{
SafeELFFile: f,
sections: sections,
license: license,
version: version,
btf: btfSpec,
}
// Go through relocation sections, and parse the ones for sections we're
// interested in. Make sure that relocations point at valid sections.
for idx, relSection := range relSections {
section := sections[idx]
if section == nil {
continue
}
rels, err := ec.loadRelocations(relSection, symbols)
if err != nil {
return nil, fmt.Errorf("relocation for section %q: %w", section.Name, err)
}
for _, rel := range rels {
target := sections[rel.Section]
if target == nil {
return nil, fmt.Errorf("section %q: reference to %q in section %s: %w", section.Name, rel.Name, rel.Section, ErrNotSupported)
}
if target.Flags&elf.SHF_STRINGS > 0 {
return nil, fmt.Errorf("section %q: string is not stack allocated: %w", section.Name, ErrNotSupported)
}
target.references++
}
section.relocations = rels
}
// Collect all the various ways to define maps.
maps := make(map[string]*MapSpec)
if err := ec.loadMaps(maps); err != nil {
return nil, fmt.Errorf("load maps: %w", err)
}
if err := ec.loadBTFMaps(maps); err != nil {
return nil, fmt.Errorf("load BTF maps: %w", err)
}
if err := ec.loadDataSections(maps); err != nil {
return nil, fmt.Errorf("load data sections: %w", err)
}
// Finally, collect programs and link them.
progs, err := ec.loadPrograms()
if err != nil {
return nil, fmt.Errorf("load programs: %w", err)
}
return &CollectionSpec{maps, progs}, nil
}
func loadLicense(sec *elf.Section) (string, error) {
if sec == nil {
return "", nil
}
data, err := sec.Data()
if err != nil {
return "", fmt.Errorf("section %s: %v", sec.Name, err)
}
return string(bytes.TrimRight(data, "\000")), nil
}
func loadVersion(sec *elf.Section, bo binary.ByteOrder) (uint32, error) {
if sec == nil {
return 0, nil
}
var version uint32
if err := binary.Read(sec.Open(), bo, &version); err != nil {
return 0, fmt.Errorf("section %s: %v", sec.Name, err)
}
return version, nil
}
type elfSectionKind int
const (
undefSection elfSectionKind = iota
mapSection
btfMapSection
programSection
dataSection
)
type elfSection struct {
*elf.Section
kind elfSectionKind
// Offset from the start of the section to a symbol
symbols map[uint64]elf.Symbol
// Offset from the start of the section to a relocation, which points at
// a symbol in another section.
relocations map[uint64]elf.Symbol
// The number of relocations pointing at this section.
references int
}
func newElfSection(section *elf.Section, kind elfSectionKind) *elfSection {
return &elfSection{
section,
kind,
make(map[uint64]elf.Symbol),
make(map[uint64]elf.Symbol),
0,
}
}
func (ec *elfCode) loadPrograms() (map[string]*ProgramSpec, error) {
var (
progs []*ProgramSpec
libs []*ProgramSpec
)
for _, sec := range ec.sections {
if sec.kind != programSection {
continue
}
if len(sec.symbols) == 0 {
return nil, fmt.Errorf("section %v: missing symbols", sec.Name)
}
funcSym, ok := sec.symbols[0]
if !ok {
return nil, fmt.Errorf("section %v: no label at start", sec.Name)
}
insns, length, err := ec.loadInstructions(sec)
if err != nil {
return nil, fmt.Errorf("program %s: %w", funcSym.Name, err)
}
progType, attachType, progFlags, attachTo := getProgType(sec.Name)
spec := &ProgramSpec{
Name: funcSym.Name,
Type: progType,
Flags: progFlags,
AttachType: attachType,
AttachTo: attachTo,
License: ec.license,
KernelVersion: ec.version,
Instructions: insns,
ByteOrder: ec.ByteOrder,
}
if ec.btf != nil {
spec.BTF, err = ec.btf.Program(sec.Name, length)
if err != nil && !errors.Is(err, btf.ErrNoExtendedInfo) {
return nil, fmt.Errorf("program %s: %w", funcSym.Name, err)
}
}
if spec.Type == UnspecifiedProgram {
// There is no single name we can use for "library" sections,
// since they may contain multiple functions. We'll decode the
// labels they contain later on, and then link sections that way.
libs = append(libs, spec)
} else {
progs = append(progs, spec)
}
}
res := make(map[string]*ProgramSpec, len(progs))
for _, prog := range progs {
err := link(prog, libs)
if err != nil {
return nil, fmt.Errorf("program %s: %w", prog.Name, err)
}
res[prog.Name] = prog
}
return res, nil
}
func (ec *elfCode) loadInstructions(section *elfSection) (asm.Instructions, uint64, error) {
var (
r = bufio.NewReader(section.Open())
insns asm.Instructions
offset uint64
)
for {
var ins asm.Instruction
n, err := ins.Unmarshal(r, ec.ByteOrder)
if err == io.EOF {
return insns, offset, nil
}
if err != nil {
return nil, 0, fmt.Errorf("offset %d: %w", offset, err)
}
ins.Symbol = section.symbols[offset].Name
if rel, ok := section.relocations[offset]; ok {
if err = ec.relocateInstruction(&ins, rel); err != nil {
return nil, 0, fmt.Errorf("offset %d: relocate instruction: %w", offset, err)
}
}
insns = append(insns, ins)
offset += n
}
}
func (ec *elfCode) relocateInstruction(ins *asm.Instruction, rel elf.Symbol) error {
var (
typ = elf.ST_TYPE(rel.Info)
bind = elf.ST_BIND(rel.Info)
name = rel.Name
)
target := ec.sections[rel.Section]
switch target.kind {
case mapSection, btfMapSection:
if bind != elf.STB_GLOBAL {
return fmt.Errorf("possible erroneous static qualifier on map definition: found reference to %q", name)
}
if typ != elf.STT_OBJECT && typ != elf.STT_NOTYPE {
// STT_NOTYPE is generated on clang < 8 which doesn't tag
// relocations appropriately.
return fmt.Errorf("map load: incorrect relocation type %v", typ)
}
ins.Src = asm.PseudoMapFD
// Mark the instruction as needing an update when creating the
// collection.
if err := ins.RewriteMapPtr(-1); err != nil {
return err
}
case dataSection:
var offset uint32
switch typ {
case elf.STT_SECTION:
if bind != elf.STB_LOCAL {
return fmt.Errorf("direct load: %s: unsupported relocation %s", name, bind)
}
// This is really a reference to a static symbol, which clang doesn't
// emit a symbol table entry for. Instead it encodes the offset in
// the instruction itself.
offset = uint32(uint64(ins.Constant))
case elf.STT_OBJECT:
if bind != elf.STB_GLOBAL {
return fmt.Errorf("direct load: %s: unsupported relocation %s", name, bind)
}
offset = uint32(rel.Value)
default:
return fmt.Errorf("incorrect relocation type %v for direct map load", typ)
}
// We rely on using the name of the data section as the reference. It
// would be nicer to keep the real name in case of an STT_OBJECT, but
// it's not clear how to encode that into Instruction.
name = target.Name
// The kernel expects the offset in the second basic BPF instruction.
ins.Constant = int64(uint64(offset) << 32)
ins.Src = asm.PseudoMapValue
// Mark the instruction as needing an update when creating the
// collection.
if err := ins.RewriteMapPtr(-1); err != nil {
return err
}
case programSection:
if ins.OpCode.JumpOp() != asm.Call {
return fmt.Errorf("not a call instruction: %s", ins)
}
if ins.Src != asm.PseudoCall {
return fmt.Errorf("call: %s: incorrect source register", name)
}
switch typ {
case elf.STT_NOTYPE, elf.STT_FUNC:
if bind != elf.STB_GLOBAL {
return fmt.Errorf("call: %s: unsupported binding: %s", name, bind)
}
case elf.STT_SECTION:
if bind != elf.STB_LOCAL {
return fmt.Errorf("call: %s: unsupported binding: %s", name, bind)
}
// The function we want to call is in the indicated section,
// at the offset encoded in the instruction itself. Reverse
// the calculation to find the real function we're looking for.
// A value of -1 references the first instruction in the section.
offset := int64(int32(ins.Constant)+1) * asm.InstructionSize
if offset < 0 {
return fmt.Errorf("call: %s: invalid offset %d", name, offset)
}
sym, ok := target.symbols[uint64(offset)]
if !ok {
return fmt.Errorf("call: %s: no symbol at offset %d", name, offset)
}
ins.Constant = -1
name = sym.Name
default:
return fmt.Errorf("call: %s: invalid symbol type %s", name, typ)
}
case undefSection:
if bind != elf.STB_GLOBAL {
return fmt.Errorf("asm relocation: %s: unsupported binding: %s", name, bind)
}
if typ != elf.STT_NOTYPE {
return fmt.Errorf("asm relocation: %s: unsupported type %s", name, typ)
}
// There is nothing to do here but set ins.Reference.
default:
return fmt.Errorf("relocation to %q: %w", target.Name, ErrNotSupported)
}
ins.Reference = name
return nil
}
func (ec *elfCode) loadMaps(maps map[string]*MapSpec) error {
for _, sec := range ec.sections {
if sec.kind != mapSection {
continue
}
nSym := len(sec.symbols)
if nSym == 0 {
return fmt.Errorf("section %v: no symbols", sec.Name)
}
if sec.Size%uint64(nSym) != 0 {
return fmt.Errorf("section %v: map descriptors are not of equal size", sec.Name)
}
var (
r = bufio.NewReader(sec.Open())
size = sec.Size / uint64(nSym)
)
for i, offset := 0, uint64(0); i < nSym; i, offset = i+1, offset+size {
mapSym, ok := sec.symbols[offset]
if !ok {
return fmt.Errorf("section %s: missing symbol for map at offset %d", sec.Name, offset)
}
mapName := mapSym.Name
if maps[mapName] != nil {
return fmt.Errorf("section %v: map %v already exists", sec.Name, mapSym)
}
lr := io.LimitReader(r, int64(size))
spec := MapSpec{
Name: SanitizeName(mapName, -1),
}
switch {
case binary.Read(lr, ec.ByteOrder, &spec.Type) != nil:
return fmt.Errorf("map %s: missing type", mapName)
case binary.Read(lr, ec.ByteOrder, &spec.KeySize) != nil:
return fmt.Errorf("map %s: missing key size", mapName)
case binary.Read(lr, ec.ByteOrder, &spec.ValueSize) != nil:
return fmt.Errorf("map %s: missing value size", mapName)
case binary.Read(lr, ec.ByteOrder, &spec.MaxEntries) != nil:
return fmt.Errorf("map %s: missing max entries", mapName)
case binary.Read(lr, ec.ByteOrder, &spec.Flags) != nil:
return fmt.Errorf("map %s: missing flags", mapName)
}
if _, err := io.Copy(internal.DiscardZeroes{}, lr); err != nil {
return fmt.Errorf("map %s: unknown and non-zero fields in definition", mapName)
}
if err := spec.clampPerfEventArraySize(); err != nil {
return fmt.Errorf("map %s: %w", mapName, err)
}
maps[mapName] = &spec
}
}
return nil
}
func (ec *elfCode) loadBTFMaps(maps map[string]*MapSpec) error {
for _, sec := range ec.sections {
if sec.kind != btfMapSection {
continue
}
if ec.btf == nil {
return fmt.Errorf("missing BTF")
}
_, err := io.Copy(internal.DiscardZeroes{}, bufio.NewReader(sec.Open()))
if err != nil {
return fmt.Errorf("section %v: initializing BTF map definitions: %w", sec.Name, internal.ErrNotSupported)
}
var ds btf.Datasec
if err := ec.btf.FindType(sec.Name, &ds); err != nil {
return fmt.Errorf("cannot find section '%s' in BTF: %w", sec.Name, err)
}
for _, vs := range ds.Vars {
v, ok := vs.Type.(*btf.Var)
if !ok {
return fmt.Errorf("section %v: unexpected type %s", sec.Name, vs.Type)
}
name := string(v.Name)
if maps[name] != nil {
return fmt.Errorf("section %v: map %s already exists", sec.Name, name)
}
mapStruct, ok := v.Type.(*btf.Struct)
if !ok {
return fmt.Errorf("expected struct, got %s", v.Type)
}
mapSpec, err := mapSpecFromBTF(name, mapStruct, false, ec.btf)
if err != nil {
return fmt.Errorf("map %v: %w", name, err)
}
if err := mapSpec.clampPerfEventArraySize(); err != nil {
return fmt.Errorf("map %v: %w", name, err)
}
maps[name] = mapSpec
}
}
return nil
}
// mapSpecFromBTF produces a MapSpec based on a btf.Struct def representing
// a BTF map definition. The name and spec arguments will be copied to the
// resulting MapSpec, and inner must be true on any resursive invocations.
func mapSpecFromBTF(name string, def *btf.Struct, inner bool, spec *btf.Spec) (*MapSpec, error) {
var (
key, value btf.Type
keySize, valueSize uint32
mapType, flags, maxEntries uint32
pinType PinType
innerMapSpec *MapSpec
err error
)
for i, member := range def.Members {
switch member.Name {
case "type":
mapType, err = uintFromBTF(member.Type)
if err != nil {
return nil, fmt.Errorf("can't get type: %w", err)
}
case "map_flags":
flags, err = uintFromBTF(member.Type)
if err != nil {
return nil, fmt.Errorf("can't get BTF map flags: %w", err)
}
case "max_entries":
maxEntries, err = uintFromBTF(member.Type)
if err != nil {
return nil, fmt.Errorf("can't get BTF map max entries: %w", err)
}
case "key":
if keySize != 0 {
return nil, errors.New("both key and key_size given")
}
pk, ok := member.Type.(*btf.Pointer)
if !ok {
return nil, fmt.Errorf("key type is not a pointer: %T", member.Type)
}
key = pk.Target
size, err := btf.Sizeof(pk.Target)
if err != nil {
return nil, fmt.Errorf("can't get size of BTF key: %w", err)
}
keySize = uint32(size)
case "value":
if valueSize != 0 {
return nil, errors.New("both value and value_size given")
}
vk, ok := member.Type.(*btf.Pointer)
if !ok {
return nil, fmt.Errorf("value type is not a pointer: %T", member.Type)
}
value = vk.Target
size, err := btf.Sizeof(vk.Target)
if err != nil {
return nil, fmt.Errorf("can't get size of BTF value: %w", err)
}
valueSize = uint32(size)
case "key_size":
// Key needs to be nil and keySize needs to be 0 for key_size to be
// considered a valid member.
if key != nil || keySize != 0 {
return nil, errors.New("both key and key_size given")
}
keySize, err = uintFromBTF(member.Type)
if err != nil {
return nil, fmt.Errorf("can't get BTF key size: %w", err)
}
case "value_size":
// Value needs to be nil and valueSize needs to be 0 for value_size to be
// considered a valid member.
if value != nil || valueSize != 0 {
return nil, errors.New("both value and value_size given")
}
valueSize, err = uintFromBTF(member.Type)
if err != nil {
return nil, fmt.Errorf("can't get BTF value size: %w", err)
}
case "pinning":
if inner {
return nil, errors.New("inner maps can't be pinned")
}
pinning, err := uintFromBTF(member.Type)
if err != nil {
return nil, fmt.Errorf("can't get pinning: %w", err)
}
pinType = PinType(pinning)
case "values":
// The 'values' field in BTF map definitions is used for declaring map
// value types that are references to other BPF objects, like other maps
// or programs. It is always expected to be an array of pointers.
if i != len(def.Members)-1 {
return nil, errors.New("'values' must be the last member in a BTF map definition")
}
if valueSize != 0 && valueSize != 4 {
return nil, errors.New("value_size must be 0 or 4")
}
valueSize = 4
valueType, err := resolveBTFArrayMacro(member.Type)
if err != nil {
return nil, fmt.Errorf("can't resolve type of member 'values': %w", err)
}
switch t := valueType.(type) {
case *btf.Struct:
// The values member pointing to an array of structs means we're expecting
// a map-in-map declaration.
if MapType(mapType) != ArrayOfMaps && MapType(mapType) != HashOfMaps {
return nil, errors.New("outer map needs to be an array or a hash of maps")
}
if inner {
return nil, fmt.Errorf("nested inner maps are not supported")
}
// This inner map spec is used as a map template, but it needs to be
// created as a traditional map before it can be used to do so.
// libbpf names the inner map template '<outer_name>.inner', but we
// opted for _inner to simplify validation logic. (dots only supported
// on kernels 5.2 and up)
// Pass the BTF spec from the parent object, since both parent and
// child must be created from the same BTF blob (on kernels that support BTF).
innerMapSpec, err = mapSpecFromBTF(name+"_inner", t, true, spec)
if err != nil {
return nil, fmt.Errorf("can't parse BTF map definition of inner map: %w", err)
}
default:
return nil, fmt.Errorf("unsupported value type %q in 'values' field", t)
}
default:
return nil, fmt.Errorf("unrecognized field %s in BTF map definition", member.Name)
}
}
bm := btf.NewMap(spec, key, value)
return &MapSpec{
Name: SanitizeName(name, -1),
Type: MapType(mapType),
KeySize: keySize,
ValueSize: valueSize,
MaxEntries: maxEntries,
Flags: flags,
BTF: &bm,
Pinning: pinType,
InnerMap: innerMapSpec,
}, nil
}
// uintFromBTF resolves the __uint macro, which is a pointer to a sized
// array, e.g. for int (*foo)[10], this function will return 10.
func uintFromBTF(typ btf.Type) (uint32, error) {
ptr, ok := typ.(*btf.Pointer)
if !ok {
return 0, fmt.Errorf("not a pointer: %v", typ)
}
arr, ok := ptr.Target.(*btf.Array)
if !ok {
return 0, fmt.Errorf("not a pointer to array: %v", typ)
}
return arr.Nelems, nil
}
// resolveBTFArrayMacro resolves the __array macro, which declares an array
// of pointers to a given type. This function returns the target Type of
// the pointers in the array.
func resolveBTFArrayMacro(typ btf.Type) (btf.Type, error) {
arr, ok := typ.(*btf.Array)
if !ok {
return nil, fmt.Errorf("not an array: %v", typ)
}
ptr, ok := arr.Type.(*btf.Pointer)
if !ok {
return nil, fmt.Errorf("not an array of pointers: %v", typ)
}
return ptr.Target, nil
}
func (ec *elfCode) loadDataSections(maps map[string]*MapSpec) error {
for _, sec := range ec.sections {
if sec.kind != dataSection {
continue
}
if sec.references == 0 {
// Prune data sections which are not referenced by any
// instructions.
continue
}
if ec.btf == nil {
return errors.New("data sections require BTF, make sure all consts are marked as static")
}
btfMap, err := ec.btf.Datasec(sec.Name)
if err != nil {
return err
}
data, err := sec.Data()
if err != nil {
return fmt.Errorf("data section %s: can't get contents: %w", sec.Name, err)
}
if uint64(len(data)) > math.MaxUint32 {
return fmt.Errorf("data section %s: contents exceed maximum size", sec.Name)
}
mapSpec := &MapSpec{
Name: SanitizeName(sec.Name, -1),
Type: Array,
KeySize: 4,
ValueSize: uint32(len(data)),
MaxEntries: 1,
Contents: []MapKV{{uint32(0), data}},
BTF: btfMap,
}
switch sec.Name {
case ".rodata":
mapSpec.Flags = unix.BPF_F_RDONLY_PROG
mapSpec.Freeze = true
case ".bss":
// The kernel already zero-initializes the map
mapSpec.Contents = nil
}
maps[sec.Name] = mapSpec
}
return nil
}
func getProgType(sectionName string) (ProgramType, AttachType, uint32, string) {
types := map[string]struct {
progType ProgramType
attachType AttachType
progFlags uint32
}{
// From https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/bpf/libbpf.c
"socket": {SocketFilter, AttachNone, 0},
"seccomp": {SocketFilter, AttachNone, 0},
"kprobe/": {Kprobe, AttachNone, 0},
"uprobe/": {Kprobe, AttachNone, 0},
"kretprobe/": {Kprobe, AttachNone, 0},
"uretprobe/": {Kprobe, AttachNone, 0},
"tracepoint/": {TracePoint, AttachNone, 0},
"raw_tracepoint/": {RawTracepoint, AttachNone, 0},
"raw_tp/": {RawTracepoint, AttachNone, 0},
"tp_btf/": {Tracing, AttachTraceRawTp, 0},
"xdp": {XDP, AttachNone, 0},
"perf_event": {PerfEvent, AttachNone, 0},
"lwt_in": {LWTIn, AttachNone, 0},
"lwt_out": {LWTOut, AttachNone, 0},
"lwt_xmit": {LWTXmit, AttachNone, 0},
"lwt_seg6local": {LWTSeg6Local, AttachNone, 0},
"sockops": {SockOps, AttachCGroupSockOps, 0},
"sk_skb/stream_parser": {SkSKB, AttachSkSKBStreamParser, 0},
"sk_skb/stream_verdict": {SkSKB, AttachSkSKBStreamParser, 0},
"sk_msg": {SkMsg, AttachSkSKBStreamVerdict, 0},
"lirc_mode2": {LircMode2, AttachLircMode2, 0},
"flow_dissector": {FlowDissector, AttachFlowDissector, 0},
"iter/": {Tracing, AttachTraceIter, 0},
"fentry/": {Tracing, AttachTraceFEntry, 0},
"fmod_ret/": {Tracing, AttachModifyReturn, 0},
"fexit/": {Tracing, AttachTraceFExit, 0},
"fentry.s/": {Tracing, AttachTraceFEntry, unix.BPF_F_SLEEPABLE},
"fmod_ret.s/": {Tracing, AttachModifyReturn, unix.BPF_F_SLEEPABLE},
"fexit.s/": {Tracing, AttachTraceFExit, unix.BPF_F_SLEEPABLE},
"sk_lookup/": {SkLookup, AttachSkLookup, 0},
"lsm/": {LSM, AttachLSMMac, 0},
"lsm.s/": {LSM, AttachLSMMac, unix.BPF_F_SLEEPABLE},
"cgroup_skb/ingress": {CGroupSKB, AttachCGroupInetIngress, 0},
"cgroup_skb/egress": {CGroupSKB, AttachCGroupInetEgress, 0},
"cgroup/dev": {CGroupDevice, AttachCGroupDevice, 0},
"cgroup/skb": {CGroupSKB, AttachNone, 0},
"cgroup/sock": {CGroupSock, AttachCGroupInetSockCreate, 0},
"cgroup/post_bind4": {CGroupSock, AttachCGroupInet4PostBind, 0},
"cgroup/post_bind6": {CGroupSock, AttachCGroupInet6PostBind, 0},
"cgroup/bind4": {CGroupSockAddr, AttachCGroupInet4Bind, 0},
"cgroup/bind6": {CGroupSockAddr, AttachCGroupInet6Bind, 0},
"cgroup/connect4": {CGroupSockAddr, AttachCGroupInet4Connect, 0},
"cgroup/connect6": {CGroupSockAddr, AttachCGroupInet6Connect, 0},
"cgroup/sendmsg4": {CGroupSockAddr, AttachCGroupUDP4Sendmsg, 0},
"cgroup/sendmsg6": {CGroupSockAddr, AttachCGroupUDP6Sendmsg, 0},
"cgroup/recvmsg4": {CGroupSockAddr, AttachCGroupUDP4Recvmsg, 0},
"cgroup/recvmsg6": {CGroupSockAddr, AttachCGroupUDP6Recvmsg, 0},
"cgroup/sysctl": {CGroupSysctl, AttachCGroupSysctl, 0},
"cgroup/getsockopt": {CGroupSockopt, AttachCGroupGetsockopt, 0},
"cgroup/setsockopt": {CGroupSockopt, AttachCGroupSetsockopt, 0},
"classifier": {SchedCLS, AttachNone, 0},
"action": {SchedACT, AttachNone, 0},
}
for prefix, t := range types {
if !strings.HasPrefix(sectionName, prefix) {
continue
}
if !strings.HasSuffix(prefix, "/") {
return t.progType, t.attachType, t.progFlags, ""
}
return t.progType, t.attachType, t.progFlags, sectionName[len(prefix):]
}
return UnspecifiedProgram, AttachNone, 0, ""
}
func (ec *elfCode) loadRelocations(sec *elf.Section, symbols []elf.Symbol) (map[uint64]elf.Symbol, error) {
rels := make(map[uint64]elf.Symbol)
if sec.Entsize < 16 {
return nil, fmt.Errorf("section %s: relocations are less than 16 bytes", sec.Name)
}
r := bufio.NewReader(sec.Open())
for off := uint64(0); off < sec.Size; off += sec.Entsize {
ent := io.LimitReader(r, int64(sec.Entsize))
var rel elf.Rel64
if binary.Read(ent, ec.ByteOrder, &rel) != nil {
return nil, fmt.Errorf("can't parse relocation at offset %v", off)
}
symNo := int(elf.R_SYM64(rel.Info) - 1)
if symNo >= len(symbols) {
return nil, fmt.Errorf("offset %d: symbol %d doesn't exist", off, symNo)
}
symbol := symbols[symNo]
rels[rel.Off] = symbol
}
return rels, nil
}

View File

@ -0,0 +1,21 @@
// +build gofuzz
// Use with https://github.com/dvyukov/go-fuzz
package ebpf
import "bytes"
func FuzzLoadCollectionSpec(data []byte) int {
spec, err := LoadCollectionSpecFromReader(bytes.NewReader(data))
if err != nil {
if spec != nil {
panic("spec is not nil")
}
return 0
}
if spec == nil {
panic("spec is nil")
}
return 1
}

9
src/runtime/vendor/github.com/cilium/ebpf/go.mod generated vendored Normal file
View File

@ -0,0 +1,9 @@
module github.com/cilium/ebpf
go 1.15
require (
github.com/frankban/quicktest v1.11.3
github.com/google/go-cmp v0.5.4
golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c
)

13
src/runtime/vendor/github.com/cilium/ebpf/go.sum generated vendored Normal file
View File

@ -0,0 +1,13 @@
github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY=
github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k=
github.com/google/go-cmp v0.5.4 h1:L8R9j+yAqZuZjsqh/z+F1NCffTKKLShY6zXTItVIZ8M=
github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c h1:VwygUrnw9jn88c4u8GD3rZQbqrP/tgas88tPUbBxQrk=
golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

239
src/runtime/vendor/github.com/cilium/ebpf/info.go generated vendored Normal file
View File

@ -0,0 +1,239 @@
package ebpf
import (
"bufio"
"encoding/hex"
"errors"
"fmt"
"io"
"os"
"strings"
"syscall"
"time"
"github.com/cilium/ebpf/internal"
)
// MapInfo describes a map.
type MapInfo struct {
Type MapType
id MapID
KeySize uint32
ValueSize uint32
MaxEntries uint32
Flags uint32
// Name as supplied by user space at load time.
Name string
}
func newMapInfoFromFd(fd *internal.FD) (*MapInfo, error) {
info, err := bpfGetMapInfoByFD(fd)
if errors.Is(err, syscall.EINVAL) {
return newMapInfoFromProc(fd)
}
if err != nil {
return nil, err
}
return &MapInfo{
MapType(info.map_type),
MapID(info.id),
info.key_size,
info.value_size,
info.max_entries,
info.map_flags,
// name is available from 4.15.
internal.CString(info.name[:]),
}, nil
}
func newMapInfoFromProc(fd *internal.FD) (*MapInfo, error) {
var mi MapInfo
err := scanFdInfo(fd, map[string]interface{}{
"map_type": &mi.Type,
"key_size": &mi.KeySize,
"value_size": &mi.ValueSize,
"max_entries": &mi.MaxEntries,
"map_flags": &mi.Flags,
})
if err != nil {
return nil, err
}
return &mi, nil
}
// ID returns the map ID.
//
// Available from 4.13.
//
// The bool return value indicates whether this optional field is available.
func (mi *MapInfo) ID() (MapID, bool) {
return mi.id, mi.id > 0
}
// programStats holds statistics of a program.
type programStats struct {
// Total accumulated runtime of the program ins ns.
runtime time.Duration
// Total number of times the program was called.
runCount uint64
}
// ProgramInfo describes a program.
type ProgramInfo struct {
Type ProgramType
id ProgramID
// Truncated hash of the BPF bytecode.
Tag string
// Name as supplied by user space at load time.
Name string
stats *programStats
}
func newProgramInfoFromFd(fd *internal.FD) (*ProgramInfo, error) {
info, err := bpfGetProgInfoByFD(fd)
if errors.Is(err, syscall.EINVAL) {
return newProgramInfoFromProc(fd)
}
if err != nil {
return nil, err
}
return &ProgramInfo{
Type: ProgramType(info.prog_type),
id: ProgramID(info.id),
// tag is available if the kernel supports BPF_PROG_GET_INFO_BY_FD.
Tag: hex.EncodeToString(info.tag[:]),
// name is available from 4.15.
Name: internal.CString(info.name[:]),
stats: &programStats{
runtime: time.Duration(info.run_time_ns),
runCount: info.run_cnt,
},
}, nil
}
func newProgramInfoFromProc(fd *internal.FD) (*ProgramInfo, error) {
var info ProgramInfo
err := scanFdInfo(fd, map[string]interface{}{
"prog_type": &info.Type,
"prog_tag": &info.Tag,
})
if errors.Is(err, errMissingFields) {
return nil, &internal.UnsupportedFeatureError{
Name: "reading program info from /proc/self/fdinfo",
MinimumVersion: internal.Version{4, 10, 0},
}
}
if err != nil {
return nil, err
}
return &info, nil
}
// ID returns the program ID.
//
// Available from 4.13.
//
// The bool return value indicates whether this optional field is available.
func (pi *ProgramInfo) ID() (ProgramID, bool) {
return pi.id, pi.id > 0
}
// RunCount returns the total number of times the program was called.
//
// Can return 0 if the collection of statistics is not enabled. See EnableStats().
// The bool return value indicates whether this optional field is available.
func (pi *ProgramInfo) RunCount() (uint64, bool) {
if pi.stats != nil {
return pi.stats.runCount, true
}
return 0, false
}
// Runtime returns the total accumulated runtime of the program.
//
// Can return 0 if the collection of statistics is not enabled. See EnableStats().
// The bool return value indicates whether this optional field is available.
func (pi *ProgramInfo) Runtime() (time.Duration, bool) {
if pi.stats != nil {
return pi.stats.runtime, true
}
return time.Duration(0), false
}
func scanFdInfo(fd *internal.FD, fields map[string]interface{}) error {
raw, err := fd.Value()
if err != nil {
return err
}
fh, err := os.Open(fmt.Sprintf("/proc/self/fdinfo/%d", raw))
if err != nil {
return err
}
defer fh.Close()
if err := scanFdInfoReader(fh, fields); err != nil {
return fmt.Errorf("%s: %w", fh.Name(), err)
}
return nil
}
var errMissingFields = errors.New("missing fields")
func scanFdInfoReader(r io.Reader, fields map[string]interface{}) error {
var (
scanner = bufio.NewScanner(r)
scanned int
)
for scanner.Scan() {
parts := strings.SplitN(scanner.Text(), "\t", 2)
if len(parts) != 2 {
continue
}
name := strings.TrimSuffix(parts[0], ":")
field, ok := fields[string(name)]
if !ok {
continue
}
if n, err := fmt.Sscanln(parts[1], field); err != nil || n != 1 {
return fmt.Errorf("can't parse field %s: %v", name, err)
}
scanned++
}
if err := scanner.Err(); err != nil {
return err
}
if scanned != len(fields) {
return errMissingFields
}
return nil
}
// EnableStats starts the measuring of the runtime
// and run counts of eBPF programs.
//
// Collecting statistics can have an impact on the performance.
//
// Requires at least 5.8.
func EnableStats(which uint32) (io.Closer, error) {
attr := internal.BPFEnableStatsAttr{
StatsType: which,
}
fd, err := internal.BPFEnableStats(&attr)
if err != nil {
return nil, err
}
return fd, nil
}

View File

@ -0,0 +1,799 @@
package btf
import (
"bytes"
"debug/elf"
"encoding/binary"
"errors"
"fmt"
"io"
"io/ioutil"
"math"
"os"
"reflect"
"sync"
"unsafe"
"github.com/cilium/ebpf/internal"
"github.com/cilium/ebpf/internal/unix"
)
const btfMagic = 0xeB9F
// Errors returned by BTF functions.
var (
ErrNotSupported = internal.ErrNotSupported
ErrNotFound = errors.New("not found")
ErrNoExtendedInfo = errors.New("no extended info")
)
// Spec represents decoded BTF.
type Spec struct {
rawTypes []rawType
strings stringTable
types []Type
namedTypes map[string][]namedType
funcInfos map[string]extInfo
lineInfos map[string]extInfo
coreRelos map[string]coreRelos
byteOrder binary.ByteOrder
}
type btfHeader struct {
Magic uint16
Version uint8
Flags uint8
HdrLen uint32
TypeOff uint32
TypeLen uint32
StringOff uint32
StringLen uint32
}
// LoadSpecFromReader reads BTF sections from an ELF.
//
// Returns ErrNotFound if the reader contains no BTF.
func LoadSpecFromReader(rd io.ReaderAt) (*Spec, error) {
file, err := internal.NewSafeELFFile(rd)
if err != nil {
return nil, err
}
defer file.Close()
btfSection, btfExtSection, sectionSizes, err := findBtfSections(file)
if err != nil {
return nil, err
}
if btfSection == nil {
return nil, fmt.Errorf("btf: %w", ErrNotFound)
}
symbols, err := file.Symbols()
if err != nil {
return nil, fmt.Errorf("can't read symbols: %v", err)
}
variableOffsets := make(map[variable]uint32)
for _, symbol := range symbols {
if idx := symbol.Section; idx >= elf.SHN_LORESERVE && idx <= elf.SHN_HIRESERVE {
// Ignore things like SHN_ABS
continue
}
if int(symbol.Section) >= len(file.Sections) {
return nil, fmt.Errorf("symbol %s: invalid section %d", symbol.Name, symbol.Section)
}
secName := file.Sections[symbol.Section].Name
if _, ok := sectionSizes[secName]; !ok {
continue
}
if symbol.Value > math.MaxUint32 {
return nil, fmt.Errorf("section %s: symbol %s: size exceeds maximum", secName, symbol.Name)
}
variableOffsets[variable{secName, symbol.Name}] = uint32(symbol.Value)
}
spec, err := loadNakedSpec(btfSection.Open(), file.ByteOrder, sectionSizes, variableOffsets)
if err != nil {
return nil, err
}
if btfExtSection == nil {
return spec, nil
}
spec.funcInfos, spec.lineInfos, spec.coreRelos, err = parseExtInfos(btfExtSection.Open(), file.ByteOrder, spec.strings)
if err != nil {
return nil, fmt.Errorf("can't read ext info: %w", err)
}
return spec, nil
}
func findBtfSections(file *internal.SafeELFFile) (*elf.Section, *elf.Section, map[string]uint32, error) {
var (
btfSection *elf.Section
btfExtSection *elf.Section
sectionSizes = make(map[string]uint32)
)
for _, sec := range file.Sections {
switch sec.Name {
case ".BTF":
btfSection = sec
case ".BTF.ext":
btfExtSection = sec
default:
if sec.Type != elf.SHT_PROGBITS && sec.Type != elf.SHT_NOBITS {
break
}
if sec.Size > math.MaxUint32 {
return nil, nil, nil, fmt.Errorf("section %s exceeds maximum size", sec.Name)
}
sectionSizes[sec.Name] = uint32(sec.Size)
}
}
return btfSection, btfExtSection, sectionSizes, nil
}
func loadSpecFromVmlinux(rd io.ReaderAt) (*Spec, error) {
file, err := internal.NewSafeELFFile(rd)
if err != nil {
return nil, err
}
defer file.Close()
btfSection, _, _, err := findBtfSections(file)
if err != nil {
return nil, fmt.Errorf(".BTF ELF section: %s", err)
}
if btfSection == nil {
return nil, fmt.Errorf("unable to find .BTF ELF section")
}
return loadNakedSpec(btfSection.Open(), file.ByteOrder, nil, nil)
}
func loadNakedSpec(btf io.ReadSeeker, bo binary.ByteOrder, sectionSizes map[string]uint32, variableOffsets map[variable]uint32) (*Spec, error) {
rawTypes, rawStrings, err := parseBTF(btf, bo)
if err != nil {
return nil, err
}
err = fixupDatasec(rawTypes, rawStrings, sectionSizes, variableOffsets)
if err != nil {
return nil, err
}
types, typesByName, err := inflateRawTypes(rawTypes, rawStrings)
if err != nil {
return nil, err
}
return &Spec{
rawTypes: rawTypes,
namedTypes: typesByName,
types: types,
strings: rawStrings,
byteOrder: bo,
}, nil
}
var kernelBTF struct {
sync.Mutex
*Spec
}
// LoadKernelSpec returns the current kernel's BTF information.
//
// Requires a >= 5.5 kernel with CONFIG_DEBUG_INFO_BTF enabled. Returns
// ErrNotSupported if BTF is not enabled.
func LoadKernelSpec() (*Spec, error) {
kernelBTF.Lock()
defer kernelBTF.Unlock()
if kernelBTF.Spec != nil {
return kernelBTF.Spec, nil
}
var err error
kernelBTF.Spec, err = loadKernelSpec()
return kernelBTF.Spec, err
}
func loadKernelSpec() (*Spec, error) {
release, err := unix.KernelRelease()
if err != nil {
return nil, fmt.Errorf("can't read kernel release number: %w", err)
}
fh, err := os.Open("/sys/kernel/btf/vmlinux")
if err == nil {
defer fh.Close()
return loadNakedSpec(fh, internal.NativeEndian, nil, nil)
}
// use same list of locations as libbpf
// https://github.com/libbpf/libbpf/blob/9a3a42608dbe3731256a5682a125ac1e23bced8f/src/btf.c#L3114-L3122
locations := []string{
"/boot/vmlinux-%s",
"/lib/modules/%s/vmlinux-%[1]s",
"/lib/modules/%s/build/vmlinux",
"/usr/lib/modules/%s/kernel/vmlinux",
"/usr/lib/debug/boot/vmlinux-%s",
"/usr/lib/debug/boot/vmlinux-%s.debug",
"/usr/lib/debug/lib/modules/%s/vmlinux",
}
for _, loc := range locations {
path := fmt.Sprintf(loc, release)
fh, err := os.Open(path)
if err != nil {
continue
}
defer fh.Close()
return loadSpecFromVmlinux(fh)
}
return nil, fmt.Errorf("no BTF for kernel version %s: %w", release, internal.ErrNotSupported)
}
func parseBTF(btf io.ReadSeeker, bo binary.ByteOrder) ([]rawType, stringTable, error) {
rawBTF, err := ioutil.ReadAll(btf)
if err != nil {
return nil, nil, fmt.Errorf("can't read BTF: %v", err)
}
rd := bytes.NewReader(rawBTF)
var header btfHeader
if err := binary.Read(rd, bo, &header); err != nil {
return nil, nil, fmt.Errorf("can't read header: %v", err)
}
if header.Magic != btfMagic {
return nil, nil, fmt.Errorf("incorrect magic value %v", header.Magic)
}
if header.Version != 1 {
return nil, nil, fmt.Errorf("unexpected version %v", header.Version)
}
if header.Flags != 0 {
return nil, nil, fmt.Errorf("unsupported flags %v", header.Flags)
}
remainder := int64(header.HdrLen) - int64(binary.Size(&header))
if remainder < 0 {
return nil, nil, errors.New("header is too short")
}
if _, err := io.CopyN(internal.DiscardZeroes{}, rd, remainder); err != nil {
return nil, nil, fmt.Errorf("header padding: %v", err)
}
if _, err := rd.Seek(int64(header.HdrLen+header.StringOff), io.SeekStart); err != nil {
return nil, nil, fmt.Errorf("can't seek to start of string section: %v", err)
}
rawStrings, err := readStringTable(io.LimitReader(rd, int64(header.StringLen)))
if err != nil {
return nil, nil, fmt.Errorf("can't read type names: %w", err)
}
if _, err := rd.Seek(int64(header.HdrLen+header.TypeOff), io.SeekStart); err != nil {
return nil, nil, fmt.Errorf("can't seek to start of type section: %v", err)
}
rawTypes, err := readTypes(io.LimitReader(rd, int64(header.TypeLen)), bo)
if err != nil {
return nil, nil, fmt.Errorf("can't read types: %w", err)
}
return rawTypes, rawStrings, nil
}
type variable struct {
section string
name string
}
func fixupDatasec(rawTypes []rawType, rawStrings stringTable, sectionSizes map[string]uint32, variableOffsets map[variable]uint32) error {
for i, rawType := range rawTypes {
if rawType.Kind() != kindDatasec {
continue
}
name, err := rawStrings.Lookup(rawType.NameOff)
if err != nil {
return err
}
if name == ".kconfig" || name == ".ksyms" {
return fmt.Errorf("reference to %s: %w", name, ErrNotSupported)
}
if rawTypes[i].SizeType != 0 {
continue
}
size, ok := sectionSizes[name]
if !ok {
return fmt.Errorf("data section %s: missing size", name)
}
rawTypes[i].SizeType = size
secinfos := rawType.data.([]btfVarSecinfo)
for j, secInfo := range secinfos {
id := int(secInfo.Type - 1)
if id >= len(rawTypes) {
return fmt.Errorf("data section %s: invalid type id %d for variable %d", name, id, j)
}
varName, err := rawStrings.Lookup(rawTypes[id].NameOff)
if err != nil {
return fmt.Errorf("data section %s: can't get name for type %d: %w", name, id, err)
}
offset, ok := variableOffsets[variable{name, varName}]
if !ok {
return fmt.Errorf("data section %s: missing offset for variable %s", name, varName)
}
secinfos[j].Offset = offset
}
}
return nil
}
type marshalOpts struct {
ByteOrder binary.ByteOrder
StripFuncLinkage bool
}
func (s *Spec) marshal(opts marshalOpts) ([]byte, error) {
var (
buf bytes.Buffer
header = new(btfHeader)
headerLen = binary.Size(header)
)
// Reserve space for the header. We have to write it last since
// we don't know the size of the type section yet.
_, _ = buf.Write(make([]byte, headerLen))
// Write type section, just after the header.
for _, raw := range s.rawTypes {
switch {
case opts.StripFuncLinkage && raw.Kind() == kindFunc:
raw.SetLinkage(StaticFunc)
}
if err := raw.Marshal(&buf, opts.ByteOrder); err != nil {
return nil, fmt.Errorf("can't marshal BTF: %w", err)
}
}
typeLen := uint32(buf.Len() - headerLen)
// Write string section after type section.
_, _ = buf.Write(s.strings)
// Fill out the header, and write it out.
header = &btfHeader{
Magic: btfMagic,
Version: 1,
Flags: 0,
HdrLen: uint32(headerLen),
TypeOff: 0,
TypeLen: typeLen,
StringOff: typeLen,
StringLen: uint32(len(s.strings)),
}
raw := buf.Bytes()
err := binary.Write(sliceWriter(raw[:headerLen]), opts.ByteOrder, header)
if err != nil {
return nil, fmt.Errorf("can't write header: %v", err)
}
return raw, nil
}
type sliceWriter []byte
func (sw sliceWriter) Write(p []byte) (int, error) {
if len(p) != len(sw) {
return 0, errors.New("size doesn't match")
}
return copy(sw, p), nil
}
// Program finds the BTF for a specific section.
//
// Length is the number of bytes in the raw BPF instruction stream.
//
// Returns an error which may wrap ErrNoExtendedInfo if the Spec doesn't
// contain extended BTF info.
func (s *Spec) Program(name string, length uint64) (*Program, error) {
if length == 0 {
return nil, errors.New("length musn't be zero")
}
if s.funcInfos == nil && s.lineInfos == nil && s.coreRelos == nil {
return nil, fmt.Errorf("BTF for section %s: %w", name, ErrNoExtendedInfo)
}
funcInfos, funcOK := s.funcInfos[name]
lineInfos, lineOK := s.lineInfos[name]
relos, coreOK := s.coreRelos[name]
if !funcOK && !lineOK && !coreOK {
return nil, fmt.Errorf("no extended BTF info for section %s", name)
}
return &Program{s, length, funcInfos, lineInfos, relos}, nil
}
// Datasec returns the BTF required to create maps which represent data sections.
func (s *Spec) Datasec(name string) (*Map, error) {
var datasec Datasec
if err := s.FindType(name, &datasec); err != nil {
return nil, fmt.Errorf("data section %s: can't get BTF: %w", name, err)
}
m := NewMap(s, &Void{}, &datasec)
return &m, nil
}
// FindType searches for a type with a specific name.
//
// hint determines the type of the returned Type.
//
// Returns an error wrapping ErrNotFound if no matching
// type exists in spec.
func (s *Spec) FindType(name string, typ Type) error {
var (
wanted = reflect.TypeOf(typ)
candidate Type
)
for _, typ := range s.namedTypes[essentialName(name)] {
if reflect.TypeOf(typ) != wanted {
continue
}
// Match against the full name, not just the essential one.
if typ.name() != name {
continue
}
if candidate != nil {
return fmt.Errorf("type %s: multiple candidates for %T", name, typ)
}
candidate = typ
}
if candidate == nil {
return fmt.Errorf("type %s: %w", name, ErrNotFound)
}
cpy, _ := copyType(candidate, nil)
value := reflect.Indirect(reflect.ValueOf(cpy))
reflect.Indirect(reflect.ValueOf(typ)).Set(value)
return nil
}
// Handle is a reference to BTF loaded into the kernel.
type Handle struct {
fd *internal.FD
}
// NewHandle loads BTF into the kernel.
//
// Returns ErrNotSupported if BTF is not supported.
func NewHandle(spec *Spec) (*Handle, error) {
if err := haveBTF(); err != nil {
return nil, err
}
if spec.byteOrder != internal.NativeEndian {
return nil, fmt.Errorf("can't load %s BTF on %s", spec.byteOrder, internal.NativeEndian)
}
btf, err := spec.marshal(marshalOpts{
ByteOrder: internal.NativeEndian,
StripFuncLinkage: haveFuncLinkage() != nil,
})
if err != nil {
return nil, fmt.Errorf("can't marshal BTF: %w", err)
}
if uint64(len(btf)) > math.MaxUint32 {
return nil, errors.New("BTF exceeds the maximum size")
}
attr := &bpfLoadBTFAttr{
btf: internal.NewSlicePointer(btf),
btfSize: uint32(len(btf)),
}
fd, err := bpfLoadBTF(attr)
if err != nil {
logBuf := make([]byte, 64*1024)
attr.logBuf = internal.NewSlicePointer(logBuf)
attr.btfLogSize = uint32(len(logBuf))
attr.btfLogLevel = 1
_, logErr := bpfLoadBTF(attr)
return nil, internal.ErrorWithLog(err, logBuf, logErr)
}
return &Handle{fd}, nil
}
// Close destroys the handle.
//
// Subsequent calls to FD will return an invalid value.
func (h *Handle) Close() error {
return h.fd.Close()
}
// FD returns the file descriptor for the handle.
func (h *Handle) FD() int {
value, err := h.fd.Value()
if err != nil {
return -1
}
return int(value)
}
// Map is the BTF for a map.
type Map struct {
spec *Spec
key, value Type
}
// NewMap returns a new Map containing the given values.
// The key and value arguments are initialized to Void if nil values are given.
func NewMap(spec *Spec, key Type, value Type) Map {
if key == nil {
key = &Void{}
}
if value == nil {
value = &Void{}
}
return Map{
spec: spec,
key: key,
value: value,
}
}
// MapSpec should be a method on Map, but is a free function
// to hide it from users of the ebpf package.
func MapSpec(m *Map) *Spec {
return m.spec
}
// MapKey should be a method on Map, but is a free function
// to hide it from users of the ebpf package.
func MapKey(m *Map) Type {
return m.key
}
// MapValue should be a method on Map, but is a free function
// to hide it from users of the ebpf package.
func MapValue(m *Map) Type {
return m.value
}
// Program is the BTF information for a stream of instructions.
type Program struct {
spec *Spec
length uint64
funcInfos, lineInfos extInfo
coreRelos coreRelos
}
// ProgramSpec returns the Spec needed for loading function and line infos into the kernel.
//
// This is a free function instead of a method to hide it from users
// of package ebpf.
func ProgramSpec(s *Program) *Spec {
return s.spec
}
// ProgramAppend the information from other to the Program.
//
// This is a free function instead of a method to hide it from users
// of package ebpf.
func ProgramAppend(s, other *Program) error {
funcInfos, err := s.funcInfos.append(other.funcInfos, s.length)
if err != nil {
return fmt.Errorf("func infos: %w", err)
}
lineInfos, err := s.lineInfos.append(other.lineInfos, s.length)
if err != nil {
return fmt.Errorf("line infos: %w", err)
}
s.funcInfos = funcInfos
s.lineInfos = lineInfos
s.coreRelos = s.coreRelos.append(other.coreRelos, s.length)
s.length += other.length
return nil
}
// ProgramFuncInfos returns the binary form of BTF function infos.
//
// This is a free function instead of a method to hide it from users
// of package ebpf.
func ProgramFuncInfos(s *Program) (recordSize uint32, bytes []byte, err error) {
bytes, err = s.funcInfos.MarshalBinary()
if err != nil {
return 0, nil, err
}
return s.funcInfos.recordSize, bytes, nil
}
// ProgramLineInfos returns the binary form of BTF line infos.
//
// This is a free function instead of a method to hide it from users
// of package ebpf.
func ProgramLineInfos(s *Program) (recordSize uint32, bytes []byte, err error) {
bytes, err = s.lineInfos.MarshalBinary()
if err != nil {
return 0, nil, err
}
return s.lineInfos.recordSize, bytes, nil
}
// ProgramFixups returns the changes required to adjust the program to the target.
//
// This is a free function instead of a method to hide it from users
// of package ebpf.
func ProgramFixups(s *Program, target *Spec) (COREFixups, error) {
if len(s.coreRelos) == 0 {
return nil, nil
}
if target == nil {
var err error
target, err = LoadKernelSpec()
if err != nil {
return nil, err
}
}
return coreRelocate(s.spec, target, s.coreRelos)
}
type bpfLoadBTFAttr struct {
btf internal.Pointer
logBuf internal.Pointer
btfSize uint32
btfLogSize uint32
btfLogLevel uint32
}
func bpfLoadBTF(attr *bpfLoadBTFAttr) (*internal.FD, error) {
fd, err := internal.BPF(internal.BPF_BTF_LOAD, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
if err != nil {
return nil, err
}
return internal.NewFD(uint32(fd)), nil
}
func marshalBTF(types interface{}, strings []byte, bo binary.ByteOrder) []byte {
const minHeaderLength = 24
typesLen := uint32(binary.Size(types))
header := btfHeader{
Magic: btfMagic,
Version: 1,
HdrLen: minHeaderLength,
TypeOff: 0,
TypeLen: typesLen,
StringOff: typesLen,
StringLen: uint32(len(strings)),
}
buf := new(bytes.Buffer)
_ = binary.Write(buf, bo, &header)
_ = binary.Write(buf, bo, types)
buf.Write(strings)
return buf.Bytes()
}
var haveBTF = internal.FeatureTest("BTF", "5.1", func() error {
var (
types struct {
Integer btfType
Var btfType
btfVar struct{ Linkage uint32 }
}
strings = []byte{0, 'a', 0}
)
// We use a BTF_KIND_VAR here, to make sure that
// the kernel understands BTF at least as well as we
// do. BTF_KIND_VAR was introduced ~5.1.
types.Integer.SetKind(kindPointer)
types.Var.NameOff = 1
types.Var.SetKind(kindVar)
types.Var.SizeType = 1
btf := marshalBTF(&types, strings, internal.NativeEndian)
fd, err := bpfLoadBTF(&bpfLoadBTFAttr{
btf: internal.NewSlicePointer(btf),
btfSize: uint32(len(btf)),
})
if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) {
// Treat both EINVAL and EPERM as not supported: loading the program
// might still succeed without BTF.
return internal.ErrNotSupported
}
if err != nil {
return err
}
fd.Close()
return nil
})
var haveFuncLinkage = internal.FeatureTest("BTF func linkage", "5.6", func() error {
if err := haveBTF(); err != nil {
return err
}
var (
types struct {
FuncProto btfType
Func btfType
}
strings = []byte{0, 'a', 0}
)
types.FuncProto.SetKind(kindFuncProto)
types.Func.SetKind(kindFunc)
types.Func.SizeType = 1 // aka FuncProto
types.Func.NameOff = 1
types.Func.SetLinkage(GlobalFunc)
btf := marshalBTF(&types, strings, internal.NativeEndian)
fd, err := bpfLoadBTF(&bpfLoadBTFAttr{
btf: internal.NewSlicePointer(btf),
btfSize: uint32(len(btf)),
})
if errors.Is(err, unix.EINVAL) {
return internal.ErrNotSupported
}
if err != nil {
return err
}
fd.Close()
return nil
})

View File

@ -0,0 +1,282 @@
package btf
import (
"encoding/binary"
"fmt"
"io"
)
//go:generate stringer -linecomment -output=btf_types_string.go -type=FuncLinkage,VarLinkage
// btfKind describes a Type.
type btfKind uint8
// Equivalents of the BTF_KIND_* constants.
const (
kindUnknown btfKind = iota
kindInt
kindPointer
kindArray
kindStruct
kindUnion
kindEnum
kindForward
kindTypedef
kindVolatile
kindConst
kindRestrict
// Added ~4.20
kindFunc
kindFuncProto
// Added ~5.1
kindVar
kindDatasec
)
// FuncLinkage describes BTF function linkage metadata.
type FuncLinkage int
// Equivalent of enum btf_func_linkage.
const (
StaticFunc FuncLinkage = iota // static
GlobalFunc // global
ExternFunc // extern
)
// VarLinkage describes BTF variable linkage metadata.
type VarLinkage int
const (
StaticVar VarLinkage = iota // static
GlobalVar // global
ExternVar // extern
)
const (
btfTypeKindShift = 24
btfTypeKindLen = 4
btfTypeVlenShift = 0
btfTypeVlenMask = 16
btfTypeKindFlagShift = 31
btfTypeKindFlagMask = 1
)
// btfType is equivalent to struct btf_type in Documentation/bpf/btf.rst.
type btfType struct {
NameOff uint32
/* "info" bits arrangement
* bits 0-15: vlen (e.g. # of struct's members), linkage
* bits 16-23: unused
* bits 24-27: kind (e.g. int, ptr, array...etc)
* bits 28-30: unused
* bit 31: kind_flag, currently used by
* struct, union and fwd
*/
Info uint32
/* "size" is used by INT, ENUM, STRUCT and UNION.
* "size" tells the size of the type it is describing.
*
* "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
* FUNC and FUNC_PROTO.
* "type" is a type_id referring to another type.
*/
SizeType uint32
}
func (k btfKind) String() string {
switch k {
case kindUnknown:
return "Unknown"
case kindInt:
return "Integer"
case kindPointer:
return "Pointer"
case kindArray:
return "Array"
case kindStruct:
return "Struct"
case kindUnion:
return "Union"
case kindEnum:
return "Enumeration"
case kindForward:
return "Forward"
case kindTypedef:
return "Typedef"
case kindVolatile:
return "Volatile"
case kindConst:
return "Const"
case kindRestrict:
return "Restrict"
case kindFunc:
return "Function"
case kindFuncProto:
return "Function Proto"
case kindVar:
return "Variable"
case kindDatasec:
return "Section"
default:
return fmt.Sprintf("Unknown (%d)", k)
}
}
func mask(len uint32) uint32 {
return (1 << len) - 1
}
func (bt *btfType) info(len, shift uint32) uint32 {
return (bt.Info >> shift) & mask(len)
}
func (bt *btfType) setInfo(value, len, shift uint32) {
bt.Info &^= mask(len) << shift
bt.Info |= (value & mask(len)) << shift
}
func (bt *btfType) Kind() btfKind {
return btfKind(bt.info(btfTypeKindLen, btfTypeKindShift))
}
func (bt *btfType) SetKind(kind btfKind) {
bt.setInfo(uint32(kind), btfTypeKindLen, btfTypeKindShift)
}
func (bt *btfType) Vlen() int {
return int(bt.info(btfTypeVlenMask, btfTypeVlenShift))
}
func (bt *btfType) SetVlen(vlen int) {
bt.setInfo(uint32(vlen), btfTypeVlenMask, btfTypeVlenShift)
}
func (bt *btfType) KindFlag() bool {
return bt.info(btfTypeKindFlagMask, btfTypeKindFlagShift) == 1
}
func (bt *btfType) Linkage() FuncLinkage {
return FuncLinkage(bt.info(btfTypeVlenMask, btfTypeVlenShift))
}
func (bt *btfType) SetLinkage(linkage FuncLinkage) {
bt.setInfo(uint32(linkage), btfTypeVlenMask, btfTypeVlenShift)
}
func (bt *btfType) Type() TypeID {
// TODO: Panic here if wrong kind?
return TypeID(bt.SizeType)
}
func (bt *btfType) Size() uint32 {
// TODO: Panic here if wrong kind?
return bt.SizeType
}
type rawType struct {
btfType
data interface{}
}
func (rt *rawType) Marshal(w io.Writer, bo binary.ByteOrder) error {
if err := binary.Write(w, bo, &rt.btfType); err != nil {
return err
}
if rt.data == nil {
return nil
}
return binary.Write(w, bo, rt.data)
}
type btfArray struct {
Type TypeID
IndexType TypeID
Nelems uint32
}
type btfMember struct {
NameOff uint32
Type TypeID
Offset uint32
}
type btfVarSecinfo struct {
Type TypeID
Offset uint32
Size uint32
}
type btfVariable struct {
Linkage uint32
}
type btfEnum struct {
NameOff uint32
Val int32
}
type btfParam struct {
NameOff uint32
Type TypeID
}
func readTypes(r io.Reader, bo binary.ByteOrder) ([]rawType, error) {
var (
header btfType
types []rawType
)
for id := TypeID(1); ; id++ {
if err := binary.Read(r, bo, &header); err == io.EOF {
return types, nil
} else if err != nil {
return nil, fmt.Errorf("can't read type info for id %v: %v", id, err)
}
var data interface{}
switch header.Kind() {
case kindInt:
data = new(uint32)
case kindPointer:
case kindArray:
data = new(btfArray)
case kindStruct:
fallthrough
case kindUnion:
data = make([]btfMember, header.Vlen())
case kindEnum:
data = make([]btfEnum, header.Vlen())
case kindForward:
case kindTypedef:
case kindVolatile:
case kindConst:
case kindRestrict:
case kindFunc:
case kindFuncProto:
data = make([]btfParam, header.Vlen())
case kindVar:
data = new(btfVariable)
case kindDatasec:
data = make([]btfVarSecinfo, header.Vlen())
default:
return nil, fmt.Errorf("type id %v: unknown kind: %v", id, header.Kind())
}
if data == nil {
types = append(types, rawType{header, nil})
continue
}
if err := binary.Read(r, bo, data); err != nil {
return nil, fmt.Errorf("type id %d: kind %v: can't read %T: %v", id, header.Kind(), data, err)
}
types = append(types, rawType{header, data})
}
}
func intEncoding(raw uint32) (IntEncoding, uint32, byte) {
return IntEncoding((raw & 0x0f000000) >> 24), (raw & 0x00ff0000) >> 16, byte(raw & 0x000000ff)
}

View File

@ -0,0 +1,44 @@
// Code generated by "stringer -linecomment -output=btf_types_string.go -type=FuncLinkage,VarLinkage"; DO NOT EDIT.
package btf
import "strconv"
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[StaticFunc-0]
_ = x[GlobalFunc-1]
_ = x[ExternFunc-2]
}
const _FuncLinkage_name = "staticglobalextern"
var _FuncLinkage_index = [...]uint8{0, 6, 12, 18}
func (i FuncLinkage) String() string {
if i < 0 || i >= FuncLinkage(len(_FuncLinkage_index)-1) {
return "FuncLinkage(" + strconv.FormatInt(int64(i), 10) + ")"
}
return _FuncLinkage_name[_FuncLinkage_index[i]:_FuncLinkage_index[i+1]]
}
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[StaticVar-0]
_ = x[GlobalVar-1]
_ = x[ExternVar-2]
}
const _VarLinkage_name = "staticglobalextern"
var _VarLinkage_index = [...]uint8{0, 6, 12, 18}
func (i VarLinkage) String() string {
if i < 0 || i >= VarLinkage(len(_VarLinkage_index)-1) {
return "VarLinkage(" + strconv.FormatInt(int64(i), 10) + ")"
}
return _VarLinkage_name[_VarLinkage_index[i]:_VarLinkage_index[i+1]]
}

View File

@ -0,0 +1,887 @@
package btf
import (
"errors"
"fmt"
"math"
"reflect"
"sort"
"strconv"
"strings"
"github.com/cilium/ebpf/asm"
)
// Code in this file is derived from libbpf, which is available under a BSD
// 2-Clause license.
// COREFixup is the result of computing a CO-RE relocation for a target.
type COREFixup struct {
Kind COREKind
Local uint32
Target uint32
Poison bool
}
func (f COREFixup) equal(other COREFixup) bool {
return f.Local == other.Local && f.Target == other.Target
}
func (f COREFixup) String() string {
if f.Poison {
return fmt.Sprintf("%s=poison", f.Kind)
}
return fmt.Sprintf("%s=%d->%d", f.Kind, f.Local, f.Target)
}
func (f COREFixup) apply(ins *asm.Instruction) error {
if f.Poison {
return errors.New("can't poison individual instruction")
}
switch class := ins.OpCode.Class(); class {
case asm.LdXClass, asm.StClass, asm.StXClass:
if want := int16(f.Local); want != ins.Offset {
return fmt.Errorf("invalid offset %d, expected %d", ins.Offset, want)
}
if f.Target > math.MaxInt16 {
return fmt.Errorf("offset %d exceeds MaxInt16", f.Target)
}
ins.Offset = int16(f.Target)
case asm.LdClass:
if !ins.IsConstantLoad(asm.DWord) {
return fmt.Errorf("not a dword-sized immediate load")
}
if want := int64(f.Local); want != ins.Constant {
return fmt.Errorf("invalid immediate %d, expected %d", ins.Constant, want)
}
ins.Constant = int64(f.Target)
case asm.ALUClass:
if ins.OpCode.ALUOp() == asm.Swap {
return fmt.Errorf("relocation against swap")
}
fallthrough
case asm.ALU64Class:
if src := ins.OpCode.Source(); src != asm.ImmSource {
return fmt.Errorf("invalid source %s", src)
}
if want := int64(f.Local); want != ins.Constant {
return fmt.Errorf("invalid immediate %d, expected %d", ins.Constant, want)
}
if f.Target > math.MaxInt32 {
return fmt.Errorf("immediate %d exceeds MaxInt32", f.Target)
}
ins.Constant = int64(f.Target)
default:
return fmt.Errorf("invalid class %s", class)
}
return nil
}
func (f COREFixup) isNonExistant() bool {
return f.Kind.checksForExistence() && f.Target == 0
}
type COREFixups map[uint64]COREFixup
// Apply a set of CO-RE relocations to a BPF program.
func (fs COREFixups) Apply(insns asm.Instructions) (asm.Instructions, error) {
if len(fs) == 0 {
cpy := make(asm.Instructions, len(insns))
copy(cpy, insns)
return insns, nil
}
cpy := make(asm.Instructions, 0, len(insns))
iter := insns.Iterate()
for iter.Next() {
fixup, ok := fs[iter.Offset.Bytes()]
if !ok {
cpy = append(cpy, *iter.Ins)
continue
}
ins := *iter.Ins
if fixup.Poison {
const badRelo = asm.BuiltinFunc(0xbad2310)
cpy = append(cpy, badRelo.Call())
if ins.OpCode.IsDWordLoad() {
// 64 bit constant loads occupy two raw bpf instructions, so
// we need to add another instruction as padding.
cpy = append(cpy, badRelo.Call())
}
continue
}
if err := fixup.apply(&ins); err != nil {
return nil, fmt.Errorf("instruction %d, offset %d: %s: %w", iter.Index, iter.Offset.Bytes(), fixup.Kind, err)
}
cpy = append(cpy, ins)
}
return cpy, nil
}
// COREKind is the type of CO-RE relocation
type COREKind uint32
const (
reloFieldByteOffset COREKind = iota /* field byte offset */
reloFieldByteSize /* field size in bytes */
reloFieldExists /* field existence in target kernel */
reloFieldSigned /* field signedness (0 - unsigned, 1 - signed) */
reloFieldLShiftU64 /* bitfield-specific left bitshift */
reloFieldRShiftU64 /* bitfield-specific right bitshift */
reloTypeIDLocal /* type ID in local BPF object */
reloTypeIDTarget /* type ID in target kernel */
reloTypeExists /* type existence in target kernel */
reloTypeSize /* type size in bytes */
reloEnumvalExists /* enum value existence in target kernel */
reloEnumvalValue /* enum value integer value */
)
func (k COREKind) String() string {
switch k {
case reloFieldByteOffset:
return "byte_off"
case reloFieldByteSize:
return "byte_sz"
case reloFieldExists:
return "field_exists"
case reloFieldSigned:
return "signed"
case reloFieldLShiftU64:
return "lshift_u64"
case reloFieldRShiftU64:
return "rshift_u64"
case reloTypeIDLocal:
return "local_type_id"
case reloTypeIDTarget:
return "target_type_id"
case reloTypeExists:
return "type_exists"
case reloTypeSize:
return "type_size"
case reloEnumvalExists:
return "enumval_exists"
case reloEnumvalValue:
return "enumval_value"
default:
return "unknown"
}
}
func (k COREKind) checksForExistence() bool {
return k == reloEnumvalExists || k == reloTypeExists || k == reloFieldExists
}
func coreRelocate(local, target *Spec, relos coreRelos) (COREFixups, error) {
if local.byteOrder != target.byteOrder {
return nil, fmt.Errorf("can't relocate %s against %s", local.byteOrder, target.byteOrder)
}
var ids []TypeID
relosByID := make(map[TypeID]coreRelos)
result := make(COREFixups, len(relos))
for _, relo := range relos {
if relo.kind == reloTypeIDLocal {
// Filtering out reloTypeIDLocal here makes our lives a lot easier
// down the line, since it doesn't have a target at all.
if len(relo.accessor) > 1 || relo.accessor[0] != 0 {
return nil, fmt.Errorf("%s: unexpected accessor %v", relo.kind, relo.accessor)
}
result[uint64(relo.insnOff)] = COREFixup{
relo.kind,
uint32(relo.typeID),
uint32(relo.typeID),
false,
}
continue
}
relos, ok := relosByID[relo.typeID]
if !ok {
ids = append(ids, relo.typeID)
}
relosByID[relo.typeID] = append(relos, relo)
}
// Ensure we work on relocations in a deterministic order.
sort.Slice(ids, func(i, j int) bool {
return ids[i] < ids[j]
})
for _, id := range ids {
if int(id) >= len(local.types) {
return nil, fmt.Errorf("invalid type id %d", id)
}
localType := local.types[id]
named, ok := localType.(namedType)
if !ok || named.name() == "" {
return nil, fmt.Errorf("relocate unnamed or anonymous type %s: %w", localType, ErrNotSupported)
}
relos := relosByID[id]
targets := target.namedTypes[named.essentialName()]
fixups, err := coreCalculateFixups(localType, targets, relos)
if err != nil {
return nil, fmt.Errorf("relocate %s: %w", localType, err)
}
for i, relo := range relos {
result[uint64(relo.insnOff)] = fixups[i]
}
}
return result, nil
}
var errAmbiguousRelocation = errors.New("ambiguous relocation")
var errImpossibleRelocation = errors.New("impossible relocation")
// coreCalculateFixups calculates the fixups for the given relocations using
// the "best" target.
//
// The best target is determined by scoring: the less poisoning we have to do
// the better the target is.
func coreCalculateFixups(local Type, targets []namedType, relos coreRelos) ([]COREFixup, error) {
localID := local.ID()
local, err := copyType(local, skipQualifierAndTypedef)
if err != nil {
return nil, err
}
bestScore := len(relos)
var bestFixups []COREFixup
for i := range targets {
targetID := targets[i].ID()
target, err := copyType(targets[i], skipQualifierAndTypedef)
if err != nil {
return nil, err
}
score := 0 // lower is better
fixups := make([]COREFixup, 0, len(relos))
for _, relo := range relos {
fixup, err := coreCalculateFixup(local, localID, target, targetID, relo)
if err != nil {
return nil, fmt.Errorf("target %s: %w", target, err)
}
if fixup.Poison || fixup.isNonExistant() {
score++
}
fixups = append(fixups, fixup)
}
if score > bestScore {
// We have a better target already, ignore this one.
continue
}
if score < bestScore {
// This is the best target yet, use it.
bestScore = score
bestFixups = fixups
continue
}
// Some other target has the same score as the current one. Make sure
// the fixups agree with each other.
for i, fixup := range bestFixups {
if !fixup.equal(fixups[i]) {
return nil, fmt.Errorf("%s: multiple types match: %w", fixup.Kind, errAmbiguousRelocation)
}
}
}
if bestFixups == nil {
// Nothing at all matched, probably because there are no suitable
// targets at all. Poison everything!
bestFixups = make([]COREFixup, len(relos))
for i, relo := range relos {
bestFixups[i] = COREFixup{Kind: relo.kind, Poison: true}
}
}
return bestFixups, nil
}
// coreCalculateFixup calculates the fixup for a single local type, target type
// and relocation.
func coreCalculateFixup(local Type, localID TypeID, target Type, targetID TypeID, relo coreRelo) (COREFixup, error) {
fixup := func(local, target uint32) (COREFixup, error) {
return COREFixup{relo.kind, local, target, false}, nil
}
poison := func() (COREFixup, error) {
if relo.kind.checksForExistence() {
return fixup(1, 0)
}
return COREFixup{relo.kind, 0, 0, true}, nil
}
zero := COREFixup{}
switch relo.kind {
case reloTypeIDTarget, reloTypeSize, reloTypeExists:
if len(relo.accessor) > 1 || relo.accessor[0] != 0 {
return zero, fmt.Errorf("%s: unexpected accessor %v", relo.kind, relo.accessor)
}
err := coreAreTypesCompatible(local, target)
if errors.Is(err, errImpossibleRelocation) {
return poison()
}
if err != nil {
return zero, fmt.Errorf("relocation %s: %w", relo.kind, err)
}
switch relo.kind {
case reloTypeExists:
return fixup(1, 1)
case reloTypeIDTarget:
return fixup(uint32(localID), uint32(targetID))
case reloTypeSize:
localSize, err := Sizeof(local)
if err != nil {
return zero, err
}
targetSize, err := Sizeof(target)
if err != nil {
return zero, err
}
return fixup(uint32(localSize), uint32(targetSize))
}
case reloEnumvalValue, reloEnumvalExists:
localValue, targetValue, err := coreFindEnumValue(local, relo.accessor, target)
if errors.Is(err, errImpossibleRelocation) {
return poison()
}
if err != nil {
return zero, fmt.Errorf("relocation %s: %w", relo.kind, err)
}
switch relo.kind {
case reloEnumvalExists:
return fixup(1, 1)
case reloEnumvalValue:
return fixup(uint32(localValue.Value), uint32(targetValue.Value))
}
case reloFieldByteOffset, reloFieldByteSize, reloFieldExists:
if _, ok := target.(*Fwd); ok {
// We can't relocate fields using a forward declaration, so
// skip it. If a non-forward declaration is present in the BTF
// we'll find it in one of the other iterations.
return poison()
}
localField, targetField, err := coreFindField(local, relo.accessor, target)
if errors.Is(err, errImpossibleRelocation) {
return poison()
}
if err != nil {
return zero, fmt.Errorf("target %s: %w", target, err)
}
switch relo.kind {
case reloFieldExists:
return fixup(1, 1)
case reloFieldByteOffset:
return fixup(localField.offset/8, targetField.offset/8)
case reloFieldByteSize:
localSize, err := Sizeof(localField.Type)
if err != nil {
return zero, err
}
targetSize, err := Sizeof(targetField.Type)
if err != nil {
return zero, err
}
return fixup(uint32(localSize), uint32(targetSize))
}
}
return zero, fmt.Errorf("relocation %s: %w", relo.kind, ErrNotSupported)
}
/* coreAccessor contains a path through a struct. It contains at least one index.
*
* The interpretation depends on the kind of the relocation. The following is
* taken from struct bpf_core_relo in libbpf_internal.h:
*
* - for field-based relocations, string encodes an accessed field using
* a sequence of field and array indices, separated by colon (:). It's
* conceptually very close to LLVM's getelementptr ([0]) instruction's
* arguments for identifying offset to a field.
* - for type-based relocations, strings is expected to be just "0";
* - for enum value-based relocations, string contains an index of enum
* value within its enum type;
*
* Example to provide a better feel.
*
* struct sample {
* int a;
* struct {
* int b[10];
* };
* };
*
* struct sample s = ...;
* int x = &s->a; // encoded as "0:0" (a is field #0)
* int y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1,
* // b is field #0 inside anon struct, accessing elem #5)
* int z = &s[10]->b; // encoded as "10:1" (ptr is used as an array)
*/
type coreAccessor []int
func parseCoreAccessor(accessor string) (coreAccessor, error) {
if accessor == "" {
return nil, fmt.Errorf("empty accessor")
}
var result coreAccessor
parts := strings.Split(accessor, ":")
for _, part := range parts {
// 31 bits to avoid overflowing int on 32 bit platforms.
index, err := strconv.ParseUint(part, 10, 31)
if err != nil {
return nil, fmt.Errorf("accessor index %q: %s", part, err)
}
result = append(result, int(index))
}
return result, nil
}
func (ca coreAccessor) String() string {
strs := make([]string, 0, len(ca))
for _, i := range ca {
strs = append(strs, strconv.Itoa(i))
}
return strings.Join(strs, ":")
}
func (ca coreAccessor) enumValue(t Type) (*EnumValue, error) {
e, ok := t.(*Enum)
if !ok {
return nil, fmt.Errorf("not an enum: %s", t)
}
if len(ca) > 1 {
return nil, fmt.Errorf("invalid accessor %s for enum", ca)
}
i := ca[0]
if i >= len(e.Values) {
return nil, fmt.Errorf("invalid index %d for %s", i, e)
}
return &e.Values[i], nil
}
type coreField struct {
Type Type
offset uint32
}
func adjustOffset(base uint32, t Type, n int) (uint32, error) {
size, err := Sizeof(t)
if err != nil {
return 0, err
}
return base + (uint32(n) * uint32(size) * 8), nil
}
// coreFindField descends into the local type using the accessor and tries to
// find an equivalent field in target at each step.
//
// Returns the field and the offset of the field from the start of
// target in bits.
func coreFindField(local Type, localAcc coreAccessor, target Type) (_, _ coreField, _ error) {
// The first index is used to offset a pointer of the base type like
// when accessing an array.
localOffset, err := adjustOffset(0, local, localAcc[0])
if err != nil {
return coreField{}, coreField{}, err
}
targetOffset, err := adjustOffset(0, target, localAcc[0])
if err != nil {
return coreField{}, coreField{}, err
}
if err := coreAreMembersCompatible(local, target); err != nil {
return coreField{}, coreField{}, fmt.Errorf("fields: %w", err)
}
var localMaybeFlex, targetMaybeFlex bool
for _, acc := range localAcc[1:] {
switch localType := local.(type) {
case composite:
// For composite types acc is used to find the field in the local type,
// and then we try to find a field in target with the same name.
localMembers := localType.members()
if acc >= len(localMembers) {
return coreField{}, coreField{}, fmt.Errorf("invalid accessor %d for %s", acc, local)
}
localMember := localMembers[acc]
if localMember.Name == "" {
_, ok := localMember.Type.(composite)
if !ok {
return coreField{}, coreField{}, fmt.Errorf("unnamed field with type %s: %s", localMember.Type, ErrNotSupported)
}
// This is an anonymous struct or union, ignore it.
local = localMember.Type
localOffset += localMember.Offset
localMaybeFlex = false
continue
}
targetType, ok := target.(composite)
if !ok {
return coreField{}, coreField{}, fmt.Errorf("target not composite: %w", errImpossibleRelocation)
}
targetMember, last, err := coreFindMember(targetType, localMember.Name)
if err != nil {
return coreField{}, coreField{}, err
}
if targetMember.BitfieldSize > 0 {
return coreField{}, coreField{}, fmt.Errorf("field %q is a bitfield: %w", targetMember.Name, ErrNotSupported)
}
local = localMember.Type
localMaybeFlex = acc == len(localMembers)-1
localOffset += localMember.Offset
target = targetMember.Type
targetMaybeFlex = last
targetOffset += targetMember.Offset
case *Array:
// For arrays, acc is the index in the target.
targetType, ok := target.(*Array)
if !ok {
return coreField{}, coreField{}, fmt.Errorf("target not array: %w", errImpossibleRelocation)
}
if localType.Nelems == 0 && !localMaybeFlex {
return coreField{}, coreField{}, fmt.Errorf("local type has invalid flexible array")
}
if targetType.Nelems == 0 && !targetMaybeFlex {
return coreField{}, coreField{}, fmt.Errorf("target type has invalid flexible array")
}
if localType.Nelems > 0 && acc >= int(localType.Nelems) {
return coreField{}, coreField{}, fmt.Errorf("invalid access of %s at index %d", localType, acc)
}
if targetType.Nelems > 0 && acc >= int(targetType.Nelems) {
return coreField{}, coreField{}, fmt.Errorf("out of bounds access of target: %w", errImpossibleRelocation)
}
local = localType.Type
localMaybeFlex = false
localOffset, err = adjustOffset(localOffset, local, acc)
if err != nil {
return coreField{}, coreField{}, err
}
target = targetType.Type
targetMaybeFlex = false
targetOffset, err = adjustOffset(targetOffset, target, acc)
if err != nil {
return coreField{}, coreField{}, err
}
default:
return coreField{}, coreField{}, fmt.Errorf("relocate field of %T: %w", localType, ErrNotSupported)
}
if err := coreAreMembersCompatible(local, target); err != nil {
return coreField{}, coreField{}, err
}
}
return coreField{local, localOffset}, coreField{target, targetOffset}, nil
}
// coreFindMember finds a member in a composite type while handling anonymous
// structs and unions.
func coreFindMember(typ composite, name Name) (Member, bool, error) {
if name == "" {
return Member{}, false, errors.New("can't search for anonymous member")
}
type offsetTarget struct {
composite
offset uint32
}
targets := []offsetTarget{{typ, 0}}
visited := make(map[composite]bool)
for i := 0; i < len(targets); i++ {
target := targets[i]
// Only visit targets once to prevent infinite recursion.
if visited[target] {
continue
}
if len(visited) >= maxTypeDepth {
// This check is different than libbpf, which restricts the entire
// path to BPF_CORE_SPEC_MAX_LEN items.
return Member{}, false, fmt.Errorf("type is nested too deep")
}
visited[target] = true
members := target.members()
for j, member := range members {
if member.Name == name {
// NB: This is safe because member is a copy.
member.Offset += target.offset
return member, j == len(members)-1, nil
}
// The names don't match, but this member could be an anonymous struct
// or union.
if member.Name != "" {
continue
}
comp, ok := member.Type.(composite)
if !ok {
return Member{}, false, fmt.Errorf("anonymous non-composite type %T not allowed", member.Type)
}
targets = append(targets, offsetTarget{comp, target.offset + member.Offset})
}
}
return Member{}, false, fmt.Errorf("no matching member: %w", errImpossibleRelocation)
}
// coreFindEnumValue follows localAcc to find the equivalent enum value in target.
func coreFindEnumValue(local Type, localAcc coreAccessor, target Type) (localValue, targetValue *EnumValue, _ error) {
localValue, err := localAcc.enumValue(local)
if err != nil {
return nil, nil, err
}
targetEnum, ok := target.(*Enum)
if !ok {
return nil, nil, errImpossibleRelocation
}
localName := localValue.Name.essentialName()
for i, targetValue := range targetEnum.Values {
if targetValue.Name.essentialName() != localName {
continue
}
return localValue, &targetEnum.Values[i], nil
}
return nil, nil, errImpossibleRelocation
}
/* The comment below is from bpf_core_types_are_compat in libbpf.c:
*
* Check local and target types for compatibility. This check is used for
* type-based CO-RE relocations and follow slightly different rules than
* field-based relocations. This function assumes that root types were already
* checked for name match. Beyond that initial root-level name check, names
* are completely ignored. Compatibility rules are as follows:
* - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but
* kind should match for local and target types (i.e., STRUCT is not
* compatible with UNION);
* - for ENUMs, the size is ignored;
* - for INT, size and signedness are ignored;
* - for ARRAY, dimensionality is ignored, element types are checked for
* compatibility recursively;
* - CONST/VOLATILE/RESTRICT modifiers are ignored;
* - TYPEDEFs/PTRs are compatible if types they pointing to are compatible;
* - FUNC_PROTOs are compatible if they have compatible signature: same
* number of input args and compatible return and argument types.
* These rules are not set in stone and probably will be adjusted as we get
* more experience with using BPF CO-RE relocations.
*
* Returns errImpossibleRelocation if types are not compatible.
*/
func coreAreTypesCompatible(localType Type, targetType Type) error {
var (
localTs, targetTs typeDeque
l, t = &localType, &targetType
depth = 0
)
for ; l != nil && t != nil; l, t = localTs.shift(), targetTs.shift() {
if depth >= maxTypeDepth {
return errors.New("types are nested too deep")
}
localType = *l
targetType = *t
if reflect.TypeOf(localType) != reflect.TypeOf(targetType) {
return fmt.Errorf("type mismatch: %w", errImpossibleRelocation)
}
switch lv := (localType).(type) {
case *Void, *Struct, *Union, *Enum, *Fwd:
// Nothing to do here
case *Int:
tv := targetType.(*Int)
if lv.isBitfield() || tv.isBitfield() {
return fmt.Errorf("bitfield: %w", errImpossibleRelocation)
}
case *Pointer, *Array:
depth++
localType.walk(&localTs)
targetType.walk(&targetTs)
case *FuncProto:
tv := targetType.(*FuncProto)
if len(lv.Params) != len(tv.Params) {
return fmt.Errorf("function param mismatch: %w", errImpossibleRelocation)
}
depth++
localType.walk(&localTs)
targetType.walk(&targetTs)
default:
return fmt.Errorf("unsupported type %T", localType)
}
}
if l != nil {
return fmt.Errorf("dangling local type %T", *l)
}
if t != nil {
return fmt.Errorf("dangling target type %T", *t)
}
return nil
}
/* coreAreMembersCompatible checks two types for field-based relocation compatibility.
*
* The comment below is from bpf_core_fields_are_compat in libbpf.c:
*
* Check two types for compatibility for the purpose of field access
* relocation. const/volatile/restrict and typedefs are skipped to ensure we
* are relocating semantically compatible entities:
* - any two STRUCTs/UNIONs are compatible and can be mixed;
* - any two FWDs are compatible, if their names match (modulo flavor suffix);
* - any two PTRs are always compatible;
* - for ENUMs, names should be the same (ignoring flavor suffix) or at
* least one of enums should be anonymous;
* - for ENUMs, check sizes, names are ignored;
* - for INT, size and signedness are ignored;
* - for ARRAY, dimensionality is ignored, element types are checked for
* compatibility recursively;
* [ NB: coreAreMembersCompatible doesn't recurse, this check is done
* by coreFindField. ]
* - everything else shouldn't be ever a target of relocation.
* These rules are not set in stone and probably will be adjusted as we get
* more experience with using BPF CO-RE relocations.
*
* Returns errImpossibleRelocation if the members are not compatible.
*/
func coreAreMembersCompatible(localType Type, targetType Type) error {
doNamesMatch := func(a, b string) error {
if a == "" || b == "" {
// allow anonymous and named type to match
return nil
}
if essentialName(a) == essentialName(b) {
return nil
}
return fmt.Errorf("names don't match: %w", errImpossibleRelocation)
}
_, lok := localType.(composite)
_, tok := targetType.(composite)
if lok && tok {
return nil
}
if reflect.TypeOf(localType) != reflect.TypeOf(targetType) {
return fmt.Errorf("type mismatch: %w", errImpossibleRelocation)
}
switch lv := localType.(type) {
case *Array, *Pointer:
return nil
case *Enum:
tv := targetType.(*Enum)
return doNamesMatch(lv.name(), tv.name())
case *Fwd:
tv := targetType.(*Fwd)
return doNamesMatch(lv.name(), tv.name())
case *Int:
tv := targetType.(*Int)
if lv.isBitfield() || tv.isBitfield() {
return fmt.Errorf("bitfield: %w", errImpossibleRelocation)
}
return nil
default:
return fmt.Errorf("type %s: %w", localType, ErrNotSupported)
}
}
func skipQualifierAndTypedef(typ Type) (Type, error) {
result := typ
for depth := 0; depth <= maxTypeDepth; depth++ {
switch v := (result).(type) {
case qualifier:
result = v.qualify()
case *Typedef:
result = v.Type
default:
return result, nil
}
}
return nil, errors.New("exceeded type depth")
}

View File

@ -0,0 +1,8 @@
// Package btf handles data encoded according to the BPF Type Format.
//
// The canonical documentation lives in the Linux kernel repository and is
// available at https://www.kernel.org/doc/html/latest/bpf/btf.html
//
// The API is very much unstable. You should only use this via the main
// ebpf library.
package btf

View File

@ -0,0 +1,303 @@
package btf
import (
"bufio"
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"io/ioutil"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/internal"
)
type btfExtHeader struct {
Magic uint16
Version uint8
Flags uint8
HdrLen uint32
FuncInfoOff uint32
FuncInfoLen uint32
LineInfoOff uint32
LineInfoLen uint32
}
type btfExtCoreHeader struct {
CoreReloOff uint32
CoreReloLen uint32
}
func parseExtInfos(r io.ReadSeeker, bo binary.ByteOrder, strings stringTable) (funcInfo, lineInfo map[string]extInfo, relos map[string]coreRelos, err error) {
var header btfExtHeader
var coreHeader btfExtCoreHeader
if err := binary.Read(r, bo, &header); err != nil {
return nil, nil, nil, fmt.Errorf("can't read header: %v", err)
}
if header.Magic != btfMagic {
return nil, nil, nil, fmt.Errorf("incorrect magic value %v", header.Magic)
}
if header.Version != 1 {
return nil, nil, nil, fmt.Errorf("unexpected version %v", header.Version)
}
if header.Flags != 0 {
return nil, nil, nil, fmt.Errorf("unsupported flags %v", header.Flags)
}
remainder := int64(header.HdrLen) - int64(binary.Size(&header))
if remainder < 0 {
return nil, nil, nil, errors.New("header is too short")
}
coreHdrSize := int64(binary.Size(&coreHeader))
if remainder >= coreHdrSize {
if err := binary.Read(r, bo, &coreHeader); err != nil {
return nil, nil, nil, fmt.Errorf("can't read CO-RE relocation header: %v", err)
}
remainder -= coreHdrSize
}
// Of course, the .BTF.ext header has different semantics than the
// .BTF ext header. We need to ignore non-null values.
_, err = io.CopyN(ioutil.Discard, r, remainder)
if err != nil {
return nil, nil, nil, fmt.Errorf("header padding: %v", err)
}
if _, err := r.Seek(int64(header.HdrLen+header.FuncInfoOff), io.SeekStart); err != nil {
return nil, nil, nil, fmt.Errorf("can't seek to function info section: %v", err)
}
buf := bufio.NewReader(io.LimitReader(r, int64(header.FuncInfoLen)))
funcInfo, err = parseExtInfo(buf, bo, strings)
if err != nil {
return nil, nil, nil, fmt.Errorf("function info: %w", err)
}
if _, err := r.Seek(int64(header.HdrLen+header.LineInfoOff), io.SeekStart); err != nil {
return nil, nil, nil, fmt.Errorf("can't seek to line info section: %v", err)
}
buf = bufio.NewReader(io.LimitReader(r, int64(header.LineInfoLen)))
lineInfo, err = parseExtInfo(buf, bo, strings)
if err != nil {
return nil, nil, nil, fmt.Errorf("line info: %w", err)
}
if coreHeader.CoreReloOff > 0 && coreHeader.CoreReloLen > 0 {
if _, err := r.Seek(int64(header.HdrLen+coreHeader.CoreReloOff), io.SeekStart); err != nil {
return nil, nil, nil, fmt.Errorf("can't seek to CO-RE relocation section: %v", err)
}
relos, err = parseExtInfoRelos(io.LimitReader(r, int64(coreHeader.CoreReloLen)), bo, strings)
if err != nil {
return nil, nil, nil, fmt.Errorf("CO-RE relocation info: %w", err)
}
}
return funcInfo, lineInfo, relos, nil
}
type btfExtInfoSec struct {
SecNameOff uint32
NumInfo uint32
}
type extInfoRecord struct {
InsnOff uint64
Opaque []byte
}
type extInfo struct {
recordSize uint32
records []extInfoRecord
}
func (ei extInfo) append(other extInfo, offset uint64) (extInfo, error) {
if other.recordSize != ei.recordSize {
return extInfo{}, fmt.Errorf("ext_info record size mismatch, want %d (got %d)", ei.recordSize, other.recordSize)
}
records := make([]extInfoRecord, 0, len(ei.records)+len(other.records))
records = append(records, ei.records...)
for _, info := range other.records {
records = append(records, extInfoRecord{
InsnOff: info.InsnOff + offset,
Opaque: info.Opaque,
})
}
return extInfo{ei.recordSize, records}, nil
}
func (ei extInfo) MarshalBinary() ([]byte, error) {
if len(ei.records) == 0 {
return nil, nil
}
buf := bytes.NewBuffer(make([]byte, 0, int(ei.recordSize)*len(ei.records)))
for _, info := range ei.records {
// The kernel expects offsets in number of raw bpf instructions,
// while the ELF tracks it in bytes.
insnOff := uint32(info.InsnOff / asm.InstructionSize)
if err := binary.Write(buf, internal.NativeEndian, insnOff); err != nil {
return nil, fmt.Errorf("can't write instruction offset: %v", err)
}
buf.Write(info.Opaque)
}
return buf.Bytes(), nil
}
func parseExtInfo(r io.Reader, bo binary.ByteOrder, strings stringTable) (map[string]extInfo, error) {
const maxRecordSize = 256
var recordSize uint32
if err := binary.Read(r, bo, &recordSize); err != nil {
return nil, fmt.Errorf("can't read record size: %v", err)
}
if recordSize < 4 {
// Need at least insnOff
return nil, errors.New("record size too short")
}
if recordSize > maxRecordSize {
return nil, fmt.Errorf("record size %v exceeds %v", recordSize, maxRecordSize)
}
result := make(map[string]extInfo)
for {
secName, infoHeader, err := parseExtInfoHeader(r, bo, strings)
if errors.Is(err, io.EOF) {
return result, nil
}
var records []extInfoRecord
for i := uint32(0); i < infoHeader.NumInfo; i++ {
var byteOff uint32
if err := binary.Read(r, bo, &byteOff); err != nil {
return nil, fmt.Errorf("section %v: can't read extended info offset: %v", secName, err)
}
buf := make([]byte, int(recordSize-4))
if _, err := io.ReadFull(r, buf); err != nil {
return nil, fmt.Errorf("section %v: can't read record: %v", secName, err)
}
if byteOff%asm.InstructionSize != 0 {
return nil, fmt.Errorf("section %v: offset %v is not aligned with instruction size", secName, byteOff)
}
records = append(records, extInfoRecord{uint64(byteOff), buf})
}
result[secName] = extInfo{
recordSize,
records,
}
}
}
// bpfCoreRelo matches `struct bpf_core_relo` from the kernel
type bpfCoreRelo struct {
InsnOff uint32
TypeID TypeID
AccessStrOff uint32
Kind COREKind
}
type coreRelo struct {
insnOff uint32
typeID TypeID
accessor coreAccessor
kind COREKind
}
type coreRelos []coreRelo
// append two slices of extInfoRelo to each other. The InsnOff of b are adjusted
// by offset.
func (r coreRelos) append(other coreRelos, offset uint64) coreRelos {
result := make([]coreRelo, 0, len(r)+len(other))
result = append(result, r...)
for _, relo := range other {
relo.insnOff += uint32(offset)
result = append(result, relo)
}
return result
}
var extInfoReloSize = binary.Size(bpfCoreRelo{})
func parseExtInfoRelos(r io.Reader, bo binary.ByteOrder, strings stringTable) (map[string]coreRelos, error) {
var recordSize uint32
if err := binary.Read(r, bo, &recordSize); err != nil {
return nil, fmt.Errorf("read record size: %v", err)
}
if recordSize != uint32(extInfoReloSize) {
return nil, fmt.Errorf("expected record size %d, got %d", extInfoReloSize, recordSize)
}
result := make(map[string]coreRelos)
for {
secName, infoHeader, err := parseExtInfoHeader(r, bo, strings)
if errors.Is(err, io.EOF) {
return result, nil
}
var relos coreRelos
for i := uint32(0); i < infoHeader.NumInfo; i++ {
var relo bpfCoreRelo
if err := binary.Read(r, bo, &relo); err != nil {
return nil, fmt.Errorf("section %v: read record: %v", secName, err)
}
if relo.InsnOff%asm.InstructionSize != 0 {
return nil, fmt.Errorf("section %v: offset %v is not aligned with instruction size", secName, relo.InsnOff)
}
accessorStr, err := strings.Lookup(relo.AccessStrOff)
if err != nil {
return nil, err
}
accessor, err := parseCoreAccessor(accessorStr)
if err != nil {
return nil, fmt.Errorf("accessor %q: %s", accessorStr, err)
}
relos = append(relos, coreRelo{
relo.InsnOff,
relo.TypeID,
accessor,
relo.Kind,
})
}
result[secName] = relos
}
}
func parseExtInfoHeader(r io.Reader, bo binary.ByteOrder, strings stringTable) (string, *btfExtInfoSec, error) {
var infoHeader btfExtInfoSec
if err := binary.Read(r, bo, &infoHeader); err != nil {
return "", nil, fmt.Errorf("read ext info header: %w", err)
}
secName, err := strings.Lookup(infoHeader.SecNameOff)
if err != nil {
return "", nil, fmt.Errorf("get section name: %w", err)
}
if infoHeader.NumInfo == 0 {
return "", nil, fmt.Errorf("section %s has zero records", secName)
}
return secName, &infoHeader, nil
}

View File

@ -0,0 +1,49 @@
// +build gofuzz
// Use with https://github.com/dvyukov/go-fuzz
package btf
import (
"bytes"
"encoding/binary"
"github.com/cilium/ebpf/internal"
)
func FuzzSpec(data []byte) int {
if len(data) < binary.Size(btfHeader{}) {
return -1
}
spec, err := loadNakedSpec(bytes.NewReader(data), internal.NativeEndian, nil, nil)
if err != nil {
if spec != nil {
panic("spec is not nil")
}
return 0
}
if spec == nil {
panic("spec is nil")
}
return 1
}
func FuzzExtInfo(data []byte) int {
if len(data) < binary.Size(btfExtHeader{}) {
return -1
}
table := stringTable("\x00foo\x00barfoo\x00")
info, err := parseExtInfo(bytes.NewReader(data), internal.NativeEndian, table)
if err != nil {
if info != nil {
panic("info is not nil")
}
return 0
}
if info == nil {
panic("info is nil")
}
return 1
}

View File

@ -0,0 +1,60 @@
package btf
import (
"bytes"
"errors"
"fmt"
"io"
"io/ioutil"
)
type stringTable []byte
func readStringTable(r io.Reader) (stringTable, error) {
contents, err := ioutil.ReadAll(r)
if err != nil {
return nil, fmt.Errorf("can't read string table: %v", err)
}
if len(contents) < 1 {
return nil, errors.New("string table is empty")
}
if contents[0] != '\x00' {
return nil, errors.New("first item in string table is non-empty")
}
if contents[len(contents)-1] != '\x00' {
return nil, errors.New("string table isn't null terminated")
}
return stringTable(contents), nil
}
func (st stringTable) Lookup(offset uint32) (string, error) {
if int64(offset) > int64(^uint(0)>>1) {
return "", fmt.Errorf("offset %d overflows int", offset)
}
pos := int(offset)
if pos >= len(st) {
return "", fmt.Errorf("offset %d is out of bounds", offset)
}
if pos > 0 && st[pos-1] != '\x00' {
return "", fmt.Errorf("offset %d isn't start of a string", offset)
}
str := st[pos:]
end := bytes.IndexByte(str, '\x00')
if end == -1 {
return "", fmt.Errorf("offset %d isn't null terminated", offset)
}
return string(str[:end]), nil
}
func (st stringTable) LookupName(offset uint32) (Name, error) {
str, err := st.Lookup(offset)
return Name(str), err
}

View File

@ -0,0 +1,893 @@
package btf
import (
"fmt"
"math"
"strings"
)
const maxTypeDepth = 32
// TypeID identifies a type in a BTF section.
type TypeID uint32
// ID implements part of the Type interface.
func (tid TypeID) ID() TypeID {
return tid
}
// Type represents a type described by BTF.
type Type interface {
ID() TypeID
String() string
// Make a copy of the type, without copying Type members.
copy() Type
// Enumerate all nested Types. Repeated calls must visit nested
// types in the same order.
walk(*typeDeque)
}
// namedType is a type with a name.
//
// Most named types simply embed Name.
type namedType interface {
Type
name() string
essentialName() string
}
// Name identifies a type.
//
// Anonymous types have an empty name.
type Name string
func (n Name) name() string {
return string(n)
}
func (n Name) essentialName() string {
return essentialName(string(n))
}
// Void is the unit type of BTF.
type Void struct{}
func (v *Void) ID() TypeID { return 0 }
func (v *Void) String() string { return "void#0" }
func (v *Void) size() uint32 { return 0 }
func (v *Void) copy() Type { return (*Void)(nil) }
func (v *Void) walk(*typeDeque) {}
type IntEncoding byte
const (
Signed IntEncoding = 1 << iota
Char
Bool
)
// Int is an integer of a given length.
type Int struct {
TypeID
Name
// The size of the integer in bytes.
Size uint32
Encoding IntEncoding
// Offset is the starting bit offset. Currently always 0.
// See https://www.kernel.org/doc/html/latest/bpf/btf.html#btf-kind-int
Offset uint32
Bits byte
}
var _ namedType = (*Int)(nil)
func (i *Int) String() string {
var s strings.Builder
switch {
case i.Encoding&Char != 0:
s.WriteString("char")
case i.Encoding&Bool != 0:
s.WriteString("bool")
default:
if i.Encoding&Signed == 0 {
s.WriteRune('u')
}
s.WriteString("int")
fmt.Fprintf(&s, "%d", i.Size*8)
}
fmt.Fprintf(&s, "#%d", i.TypeID)
if i.Bits > 0 {
fmt.Fprintf(&s, "[bits=%d]", i.Bits)
}
return s.String()
}
func (i *Int) size() uint32 { return i.Size }
func (i *Int) walk(*typeDeque) {}
func (i *Int) copy() Type {
cpy := *i
return &cpy
}
func (i *Int) isBitfield() bool {
return i.Offset > 0
}
// Pointer is a pointer to another type.
type Pointer struct {
TypeID
Target Type
}
func (p *Pointer) String() string {
return fmt.Sprintf("pointer#%d[target=#%d]", p.TypeID, p.Target.ID())
}
func (p *Pointer) size() uint32 { return 8 }
func (p *Pointer) walk(tdq *typeDeque) { tdq.push(&p.Target) }
func (p *Pointer) copy() Type {
cpy := *p
return &cpy
}
// Array is an array with a fixed number of elements.
type Array struct {
TypeID
Type Type
Nelems uint32
}
func (arr *Array) String() string {
return fmt.Sprintf("array#%d[type=#%d n=%d]", arr.TypeID, arr.Type.ID(), arr.Nelems)
}
func (arr *Array) walk(tdq *typeDeque) { tdq.push(&arr.Type) }
func (arr *Array) copy() Type {
cpy := *arr
return &cpy
}
// Struct is a compound type of consecutive members.
type Struct struct {
TypeID
Name
// The size of the struct including padding, in bytes
Size uint32
Members []Member
}
func (s *Struct) String() string {
return fmt.Sprintf("struct#%d[%q]", s.TypeID, s.Name)
}
func (s *Struct) size() uint32 { return s.Size }
func (s *Struct) walk(tdq *typeDeque) {
for i := range s.Members {
tdq.push(&s.Members[i].Type)
}
}
func (s *Struct) copy() Type {
cpy := *s
cpy.Members = copyMembers(s.Members)
return &cpy
}
func (s *Struct) members() []Member {
return s.Members
}
// Union is a compound type where members occupy the same memory.
type Union struct {
TypeID
Name
// The size of the union including padding, in bytes.
Size uint32
Members []Member
}
func (u *Union) String() string {
return fmt.Sprintf("union#%d[%q]", u.TypeID, u.Name)
}
func (u *Union) size() uint32 { return u.Size }
func (u *Union) walk(tdq *typeDeque) {
for i := range u.Members {
tdq.push(&u.Members[i].Type)
}
}
func (u *Union) copy() Type {
cpy := *u
cpy.Members = copyMembers(u.Members)
return &cpy
}
func (u *Union) members() []Member {
return u.Members
}
func copyMembers(orig []Member) []Member {
cpy := make([]Member, len(orig))
copy(cpy, orig)
return cpy
}
type composite interface {
members() []Member
}
var (
_ composite = (*Struct)(nil)
_ composite = (*Union)(nil)
)
// Member is part of a Struct or Union.
//
// It is not a valid Type.
type Member struct {
Name
Type Type
// Offset is the bit offset of this member
Offset uint32
BitfieldSize uint32
}
// Enum lists possible values.
type Enum struct {
TypeID
Name
Values []EnumValue
}
func (e *Enum) String() string {
return fmt.Sprintf("enum#%d[%q]", e.TypeID, e.Name)
}
// EnumValue is part of an Enum
//
// Is is not a valid Type
type EnumValue struct {
Name
Value int32
}
func (e *Enum) size() uint32 { return 4 }
func (e *Enum) walk(*typeDeque) {}
func (e *Enum) copy() Type {
cpy := *e
cpy.Values = make([]EnumValue, len(e.Values))
copy(cpy.Values, e.Values)
return &cpy
}
// FwdKind is the type of forward declaration.
type FwdKind int
// Valid types of forward declaration.
const (
FwdStruct FwdKind = iota
FwdUnion
)
func (fk FwdKind) String() string {
switch fk {
case FwdStruct:
return "struct"
case FwdUnion:
return "union"
default:
return fmt.Sprintf("%T(%d)", fk, int(fk))
}
}
// Fwd is a forward declaration of a Type.
type Fwd struct {
TypeID
Name
Kind FwdKind
}
func (f *Fwd) String() string {
return fmt.Sprintf("fwd#%d[%s %q]", f.TypeID, f.Kind, f.Name)
}
func (f *Fwd) walk(*typeDeque) {}
func (f *Fwd) copy() Type {
cpy := *f
return &cpy
}
// Typedef is an alias of a Type.
type Typedef struct {
TypeID
Name
Type Type
}
func (td *Typedef) String() string {
return fmt.Sprintf("typedef#%d[%q #%d]", td.TypeID, td.Name, td.Type.ID())
}
func (td *Typedef) walk(tdq *typeDeque) { tdq.push(&td.Type) }
func (td *Typedef) copy() Type {
cpy := *td
return &cpy
}
// Volatile is a qualifier.
type Volatile struct {
TypeID
Type Type
}
func (v *Volatile) String() string {
return fmt.Sprintf("volatile#%d[#%d]", v.TypeID, v.Type.ID())
}
func (v *Volatile) qualify() Type { return v.Type }
func (v *Volatile) walk(tdq *typeDeque) { tdq.push(&v.Type) }
func (v *Volatile) copy() Type {
cpy := *v
return &cpy
}
// Const is a qualifier.
type Const struct {
TypeID
Type Type
}
func (c *Const) String() string {
return fmt.Sprintf("const#%d[#%d]", c.TypeID, c.Type.ID())
}
func (c *Const) qualify() Type { return c.Type }
func (c *Const) walk(tdq *typeDeque) { tdq.push(&c.Type) }
func (c *Const) copy() Type {
cpy := *c
return &cpy
}
// Restrict is a qualifier.
type Restrict struct {
TypeID
Type Type
}
func (r *Restrict) String() string {
return fmt.Sprintf("restrict#%d[#%d]", r.TypeID, r.Type.ID())
}
func (r *Restrict) qualify() Type { return r.Type }
func (r *Restrict) walk(tdq *typeDeque) { tdq.push(&r.Type) }
func (r *Restrict) copy() Type {
cpy := *r
return &cpy
}
// Func is a function definition.
type Func struct {
TypeID
Name
Type Type
Linkage FuncLinkage
}
func (f *Func) String() string {
return fmt.Sprintf("func#%d[%s %q proto=#%d]", f.TypeID, f.Linkage, f.Name, f.Type.ID())
}
func (f *Func) walk(tdq *typeDeque) { tdq.push(&f.Type) }
func (f *Func) copy() Type {
cpy := *f
return &cpy
}
// FuncProto is a function declaration.
type FuncProto struct {
TypeID
Return Type
Params []FuncParam
}
func (fp *FuncProto) String() string {
var s strings.Builder
fmt.Fprintf(&s, "proto#%d[", fp.TypeID)
for _, param := range fp.Params {
fmt.Fprintf(&s, "%q=#%d, ", param.Name, param.Type.ID())
}
fmt.Fprintf(&s, "return=#%d]", fp.Return.ID())
return s.String()
}
func (fp *FuncProto) walk(tdq *typeDeque) {
tdq.push(&fp.Return)
for i := range fp.Params {
tdq.push(&fp.Params[i].Type)
}
}
func (fp *FuncProto) copy() Type {
cpy := *fp
cpy.Params = make([]FuncParam, len(fp.Params))
copy(cpy.Params, fp.Params)
return &cpy
}
type FuncParam struct {
Name
Type Type
}
// Var is a global variable.
type Var struct {
TypeID
Name
Type Type
Linkage VarLinkage
}
func (v *Var) String() string {
return fmt.Sprintf("var#%d[%s %q]", v.TypeID, v.Linkage, v.Name)
}
func (v *Var) walk(tdq *typeDeque) { tdq.push(&v.Type) }
func (v *Var) copy() Type {
cpy := *v
return &cpy
}
// Datasec is a global program section containing data.
type Datasec struct {
TypeID
Name
Size uint32
Vars []VarSecinfo
}
func (ds *Datasec) String() string {
return fmt.Sprintf("section#%d[%q]", ds.TypeID, ds.Name)
}
func (ds *Datasec) size() uint32 { return ds.Size }
func (ds *Datasec) walk(tdq *typeDeque) {
for i := range ds.Vars {
tdq.push(&ds.Vars[i].Type)
}
}
func (ds *Datasec) copy() Type {
cpy := *ds
cpy.Vars = make([]VarSecinfo, len(ds.Vars))
copy(cpy.Vars, ds.Vars)
return &cpy
}
// VarSecinfo describes variable in a Datasec
//
// It is not a valid Type.
type VarSecinfo struct {
Type Type
Offset uint32
Size uint32
}
type sizer interface {
size() uint32
}
var (
_ sizer = (*Int)(nil)
_ sizer = (*Pointer)(nil)
_ sizer = (*Struct)(nil)
_ sizer = (*Union)(nil)
_ sizer = (*Enum)(nil)
_ sizer = (*Datasec)(nil)
)
type qualifier interface {
qualify() Type
}
var (
_ qualifier = (*Const)(nil)
_ qualifier = (*Restrict)(nil)
_ qualifier = (*Volatile)(nil)
)
// Sizeof returns the size of a type in bytes.
//
// Returns an error if the size can't be computed.
func Sizeof(typ Type) (int, error) {
var (
n = int64(1)
elem int64
)
for i := 0; i < maxTypeDepth; i++ {
switch v := typ.(type) {
case *Array:
if n > 0 && int64(v.Nelems) > math.MaxInt64/n {
return 0, fmt.Errorf("type %s: overflow", typ)
}
// Arrays may be of zero length, which allows
// n to be zero as well.
n *= int64(v.Nelems)
typ = v.Type
continue
case sizer:
elem = int64(v.size())
case *Typedef:
typ = v.Type
continue
case qualifier:
typ = v.qualify()
continue
default:
return 0, fmt.Errorf("unsized type %T", typ)
}
if n > 0 && elem > math.MaxInt64/n {
return 0, fmt.Errorf("type %s: overflow", typ)
}
size := n * elem
if int64(int(size)) != size {
return 0, fmt.Errorf("type %s: overflow", typ)
}
return int(size), nil
}
return 0, fmt.Errorf("type %s: exceeded type depth", typ)
}
// copy a Type recursively.
//
// typ may form a cycle.
//
// Returns any errors from transform verbatim.
func copyType(typ Type, transform func(Type) (Type, error)) (Type, error) {
var (
copies = make(map[Type]Type)
work typeDeque
)
for t := &typ; t != nil; t = work.pop() {
// *t is the identity of the type.
if cpy := copies[*t]; cpy != nil {
*t = cpy
continue
}
var cpy Type
if transform != nil {
tf, err := transform(*t)
if err != nil {
return nil, fmt.Errorf("copy %s: %w", typ, err)
}
cpy = tf.copy()
} else {
cpy = (*t).copy()
}
copies[*t] = cpy
*t = cpy
// Mark any nested types for copying.
cpy.walk(&work)
}
return typ, nil
}
// typeDeque keeps track of pointers to types which still
// need to be visited.
type typeDeque struct {
types []*Type
read, write uint64
mask uint64
}
// push adds a type to the stack.
func (dq *typeDeque) push(t *Type) {
if dq.write-dq.read < uint64(len(dq.types)) {
dq.types[dq.write&dq.mask] = t
dq.write++
return
}
new := len(dq.types) * 2
if new == 0 {
new = 8
}
types := make([]*Type, new)
pivot := dq.read & dq.mask
n := copy(types, dq.types[pivot:])
n += copy(types[n:], dq.types[:pivot])
types[n] = t
dq.types = types
dq.mask = uint64(new) - 1
dq.read, dq.write = 0, uint64(n+1)
}
// shift returns the first element or null.
func (dq *typeDeque) shift() *Type {
if dq.read == dq.write {
return nil
}
index := dq.read & dq.mask
t := dq.types[index]
dq.types[index] = nil
dq.read++
return t
}
// pop returns the last element or null.
func (dq *typeDeque) pop() *Type {
if dq.read == dq.write {
return nil
}
dq.write--
index := dq.write & dq.mask
t := dq.types[index]
dq.types[index] = nil
return t
}
// all returns all elements.
//
// The deque is empty after calling this method.
func (dq *typeDeque) all() []*Type {
length := dq.write - dq.read
types := make([]*Type, 0, length)
for t := dq.shift(); t != nil; t = dq.shift() {
types = append(types, t)
}
return types
}
// inflateRawTypes takes a list of raw btf types linked via type IDs, and turns
// it into a graph of Types connected via pointers.
//
// Returns a map of named types (so, where NameOff is non-zero) and a slice of types
// indexed by TypeID. Since BTF ignores compilation units, multiple types may share
// the same name. A Type may form a cyclic graph by pointing at itself.
func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (types []Type, namedTypes map[string][]namedType, err error) {
type fixupDef struct {
id TypeID
expectedKind btfKind
typ *Type
}
var fixups []fixupDef
fixup := func(id TypeID, expectedKind btfKind, typ *Type) {
fixups = append(fixups, fixupDef{id, expectedKind, typ})
}
convertMembers := func(raw []btfMember, kindFlag bool) ([]Member, error) {
// NB: The fixup below relies on pre-allocating this array to
// work, since otherwise append might re-allocate members.
members := make([]Member, 0, len(raw))
for i, btfMember := range raw {
name, err := rawStrings.LookupName(btfMember.NameOff)
if err != nil {
return nil, fmt.Errorf("can't get name for member %d: %w", i, err)
}
m := Member{
Name: name,
Offset: btfMember.Offset,
}
if kindFlag {
m.BitfieldSize = btfMember.Offset >> 24
m.Offset &= 0xffffff
}
members = append(members, m)
}
for i := range members {
fixup(raw[i].Type, kindUnknown, &members[i].Type)
}
return members, nil
}
types = make([]Type, 0, len(rawTypes))
types = append(types, (*Void)(nil))
namedTypes = make(map[string][]namedType)
for i, raw := range rawTypes {
var (
// Void is defined to always be type ID 0, and is thus
// omitted from BTF.
id = TypeID(i + 1)
typ Type
)
name, err := rawStrings.LookupName(raw.NameOff)
if err != nil {
return nil, nil, fmt.Errorf("get name for type id %d: %w", id, err)
}
switch raw.Kind() {
case kindInt:
encoding, offset, bits := intEncoding(*raw.data.(*uint32))
typ = &Int{id, name, raw.Size(), encoding, offset, bits}
case kindPointer:
ptr := &Pointer{id, nil}
fixup(raw.Type(), kindUnknown, &ptr.Target)
typ = ptr
case kindArray:
btfArr := raw.data.(*btfArray)
// IndexType is unused according to btf.rst.
// Don't make it available right now.
arr := &Array{id, nil, btfArr.Nelems}
fixup(btfArr.Type, kindUnknown, &arr.Type)
typ = arr
case kindStruct:
members, err := convertMembers(raw.data.([]btfMember), raw.KindFlag())
if err != nil {
return nil, nil, fmt.Errorf("struct %s (id %d): %w", name, id, err)
}
typ = &Struct{id, name, raw.Size(), members}
case kindUnion:
members, err := convertMembers(raw.data.([]btfMember), raw.KindFlag())
if err != nil {
return nil, nil, fmt.Errorf("union %s (id %d): %w", name, id, err)
}
typ = &Union{id, name, raw.Size(), members}
case kindEnum:
rawvals := raw.data.([]btfEnum)
vals := make([]EnumValue, 0, len(rawvals))
for i, btfVal := range rawvals {
name, err := rawStrings.LookupName(btfVal.NameOff)
if err != nil {
return nil, nil, fmt.Errorf("get name for enum value %d: %s", i, err)
}
vals = append(vals, EnumValue{
Name: name,
Value: btfVal.Val,
})
}
typ = &Enum{id, name, vals}
case kindForward:
if raw.KindFlag() {
typ = &Fwd{id, name, FwdUnion}
} else {
typ = &Fwd{id, name, FwdStruct}
}
case kindTypedef:
typedef := &Typedef{id, name, nil}
fixup(raw.Type(), kindUnknown, &typedef.Type)
typ = typedef
case kindVolatile:
volatile := &Volatile{id, nil}
fixup(raw.Type(), kindUnknown, &volatile.Type)
typ = volatile
case kindConst:
cnst := &Const{id, nil}
fixup(raw.Type(), kindUnknown, &cnst.Type)
typ = cnst
case kindRestrict:
restrict := &Restrict{id, nil}
fixup(raw.Type(), kindUnknown, &restrict.Type)
typ = restrict
case kindFunc:
fn := &Func{id, name, nil, raw.Linkage()}
fixup(raw.Type(), kindFuncProto, &fn.Type)
typ = fn
case kindFuncProto:
rawparams := raw.data.([]btfParam)
params := make([]FuncParam, 0, len(rawparams))
for i, param := range rawparams {
name, err := rawStrings.LookupName(param.NameOff)
if err != nil {
return nil, nil, fmt.Errorf("get name for func proto parameter %d: %s", i, err)
}
params = append(params, FuncParam{
Name: name,
})
}
for i := range params {
fixup(rawparams[i].Type, kindUnknown, &params[i].Type)
}
fp := &FuncProto{id, nil, params}
fixup(raw.Type(), kindUnknown, &fp.Return)
typ = fp
case kindVar:
variable := raw.data.(*btfVariable)
v := &Var{id, name, nil, VarLinkage(variable.Linkage)}
fixup(raw.Type(), kindUnknown, &v.Type)
typ = v
case kindDatasec:
btfVars := raw.data.([]btfVarSecinfo)
vars := make([]VarSecinfo, 0, len(btfVars))
for _, btfVar := range btfVars {
vars = append(vars, VarSecinfo{
Offset: btfVar.Offset,
Size: btfVar.Size,
})
}
for i := range vars {
fixup(btfVars[i].Type, kindVar, &vars[i].Type)
}
typ = &Datasec{id, name, raw.SizeType, vars}
default:
return nil, nil, fmt.Errorf("type id %d: unknown kind: %v", id, raw.Kind())
}
types = append(types, typ)
if named, ok := typ.(namedType); ok {
if name := essentialName(named.name()); name != "" {
namedTypes[name] = append(namedTypes[name], named)
}
}
}
for _, fixup := range fixups {
i := int(fixup.id)
if i >= len(types) {
return nil, nil, fmt.Errorf("reference to invalid type id: %d", fixup.id)
}
// Default void (id 0) to unknown
rawKind := kindUnknown
if i > 0 {
rawKind = rawTypes[i-1].Kind()
}
if expected := fixup.expectedKind; expected != kindUnknown && rawKind != expected {
return nil, nil, fmt.Errorf("expected type id %d to have kind %s, found %s", fixup.id, expected, rawKind)
}
*fixup.typ = types[i]
}
return types, namedTypes, nil
}
// essentialName returns name without a ___ suffix.
func essentialName(name string) string {
lastIdx := strings.LastIndex(name, "___")
if lastIdx > 0 {
return name[:lastIdx]
}
return name
}

View File

@ -0,0 +1,62 @@
package internal
import (
"fmt"
"io/ioutil"
"strings"
"sync"
)
var sysCPU struct {
once sync.Once
err error
num int
}
// PossibleCPUs returns the max number of CPUs a system may possibly have
// Logical CPU numbers must be of the form 0-n
func PossibleCPUs() (int, error) {
sysCPU.once.Do(func() {
sysCPU.num, sysCPU.err = parseCPUsFromFile("/sys/devices/system/cpu/possible")
})
return sysCPU.num, sysCPU.err
}
func parseCPUsFromFile(path string) (int, error) {
spec, err := ioutil.ReadFile(path)
if err != nil {
return 0, err
}
n, err := parseCPUs(string(spec))
if err != nil {
return 0, fmt.Errorf("can't parse %s: %v", path, err)
}
return n, nil
}
// parseCPUs parses the number of cpus from a string produced
// by bitmap_list_string() in the Linux kernel.
// Multiple ranges are rejected, since they can't be unified
// into a single number.
// This is the format of /sys/devices/system/cpu/possible, it
// is not suitable for /sys/devices/system/cpu/online, etc.
func parseCPUs(spec string) (int, error) {
if strings.Trim(spec, "\n") == "0" {
return 1, nil
}
var low, high int
n, err := fmt.Sscanf(spec, "%d-%d\n", &low, &high)
if n != 2 || err != nil {
return 0, fmt.Errorf("invalid format: %s", spec)
}
if low != 0 {
return 0, fmt.Errorf("CPU spec doesn't start at zero: %s", spec)
}
// cpus is 0 indexed
return high + 1, nil
}

View File

@ -0,0 +1,68 @@
package internal
import (
"debug/elf"
"fmt"
"io"
)
type SafeELFFile struct {
*elf.File
}
// NewSafeELFFile reads an ELF safely.
//
// Any panic during parsing is turned into an error. This is necessary since
// there are a bunch of unfixed bugs in debug/elf.
//
// https://github.com/golang/go/issues?q=is%3Aissue+is%3Aopen+debug%2Felf+in%3Atitle
func NewSafeELFFile(r io.ReaderAt) (safe *SafeELFFile, err error) {
defer func() {
r := recover()
if r == nil {
return
}
safe = nil
err = fmt.Errorf("reading ELF file panicked: %s", r)
}()
file, err := elf.NewFile(r)
if err != nil {
return nil, err
}
return &SafeELFFile{file}, nil
}
// Symbols is the safe version of elf.File.Symbols.
func (se *SafeELFFile) Symbols() (syms []elf.Symbol, err error) {
defer func() {
r := recover()
if r == nil {
return
}
syms = nil
err = fmt.Errorf("reading ELF symbols panicked: %s", r)
}()
syms, err = se.File.Symbols()
return
}
// DynamicSymbols is the safe version of elf.File.DynamicSymbols.
func (se *SafeELFFile) DynamicSymbols() (syms []elf.Symbol, err error) {
defer func() {
r := recover()
if r == nil {
return
}
syms = nil
err = fmt.Errorf("reading ELF dynamic symbols panicked: %s", r)
}()
syms, err = se.File.DynamicSymbols()
return
}

View File

@ -0,0 +1,29 @@
package internal
import (
"encoding/binary"
"unsafe"
)
// NativeEndian is set to either binary.BigEndian or binary.LittleEndian,
// depending on the host's endianness.
var NativeEndian binary.ByteOrder
// Clang is set to either "el" or "eb" depending on the host's endianness.
var ClangEndian string
func init() {
if isBigEndian() {
NativeEndian = binary.BigEndian
ClangEndian = "eb"
} else {
NativeEndian = binary.LittleEndian
ClangEndian = "el"
}
}
func isBigEndian() (ret bool) {
i := int(0x1)
bs := (*[int(unsafe.Sizeof(i))]byte)(unsafe.Pointer(&i))
return bs[0] == 0
}

View File

@ -0,0 +1,51 @@
package internal
import (
"bytes"
"errors"
"fmt"
"strings"
"github.com/cilium/ebpf/internal/unix"
)
// ErrorWithLog returns an error that includes logs from the
// kernel verifier.
//
// logErr should be the error returned by the syscall that generated
// the log. It is used to check for truncation of the output.
func ErrorWithLog(err error, log []byte, logErr error) error {
logStr := strings.Trim(CString(log), "\t\r\n ")
if errors.Is(logErr, unix.ENOSPC) {
logStr += " (truncated...)"
}
return &VerifierError{err, logStr}
}
// VerifierError includes information from the eBPF verifier.
type VerifierError struct {
cause error
log string
}
func (le *VerifierError) Unwrap() error {
return le.cause
}
func (le *VerifierError) Error() string {
if le.log == "" {
return le.cause.Error()
}
return fmt.Sprintf("%s: %s", le.cause, le.log)
}
// CString turns a NUL / zero terminated byte buffer into a string.
func CString(in []byte) string {
inLen := bytes.IndexByte(in, 0)
if inLen == -1 {
return ""
}
return string(in[:inLen])
}

View File

@ -0,0 +1,69 @@
package internal
import (
"errors"
"fmt"
"os"
"runtime"
"strconv"
"github.com/cilium/ebpf/internal/unix"
)
var ErrClosedFd = errors.New("use of closed file descriptor")
type FD struct {
raw int64
}
func NewFD(value uint32) *FD {
fd := &FD{int64(value)}
runtime.SetFinalizer(fd, (*FD).Close)
return fd
}
func (fd *FD) String() string {
return strconv.FormatInt(fd.raw, 10)
}
func (fd *FD) Value() (uint32, error) {
if fd.raw < 0 {
return 0, ErrClosedFd
}
return uint32(fd.raw), nil
}
func (fd *FD) Close() error {
if fd.raw < 0 {
return nil
}
value := int(fd.raw)
fd.raw = -1
fd.Forget()
return unix.Close(value)
}
func (fd *FD) Forget() {
runtime.SetFinalizer(fd, nil)
}
func (fd *FD) Dup() (*FD, error) {
if fd.raw < 0 {
return nil, ErrClosedFd
}
dup, err := unix.FcntlInt(uintptr(fd.raw), unix.F_DUPFD_CLOEXEC, 0)
if err != nil {
return nil, fmt.Errorf("can't dup fd: %v", err)
}
return NewFD(uint32(dup)), nil
}
func (fd *FD) File(name string) *os.File {
fd.Forget()
return os.NewFile(uintptr(fd.raw), name)
}

View File

@ -0,0 +1,100 @@
package internal
import (
"errors"
"fmt"
"sync"
)
// ErrNotSupported indicates that a feature is not supported by the current kernel.
var ErrNotSupported = errors.New("not supported")
// UnsupportedFeatureError is returned by FeatureTest() functions.
type UnsupportedFeatureError struct {
// The minimum Linux mainline version required for this feature.
// Used for the error string, and for sanity checking during testing.
MinimumVersion Version
// The name of the feature that isn't supported.
Name string
}
func (ufe *UnsupportedFeatureError) Error() string {
if ufe.MinimumVersion.Unspecified() {
return fmt.Sprintf("%s not supported", ufe.Name)
}
return fmt.Sprintf("%s not supported (requires >= %s)", ufe.Name, ufe.MinimumVersion)
}
// Is indicates that UnsupportedFeatureError is ErrNotSupported.
func (ufe *UnsupportedFeatureError) Is(target error) bool {
return target == ErrNotSupported
}
type featureTest struct {
sync.RWMutex
successful bool
result error
}
// FeatureTestFn is used to determine whether the kernel supports
// a certain feature.
//
// The return values have the following semantics:
//
// err == ErrNotSupported: the feature is not available
// err == nil: the feature is available
// err != nil: the test couldn't be executed
type FeatureTestFn func() error
// FeatureTest wraps a function so that it is run at most once.
//
// name should identify the tested feature, while version must be in the
// form Major.Minor[.Patch].
//
// Returns an error wrapping ErrNotSupported if the feature is not supported.
func FeatureTest(name, version string, fn FeatureTestFn) func() error {
v, err := NewVersion(version)
if err != nil {
return func() error { return err }
}
ft := new(featureTest)
return func() error {
ft.RLock()
if ft.successful {
defer ft.RUnlock()
return ft.result
}
ft.RUnlock()
ft.Lock()
defer ft.Unlock()
// check one more time on the off
// chance that two go routines
// were able to call into the write
// lock
if ft.successful {
return ft.result
}
err := fn()
switch {
case errors.Is(err, ErrNotSupported):
ft.result = &UnsupportedFeatureError{
MinimumVersion: v,
Name: name,
}
fallthrough
case err == nil:
ft.successful = true
default:
// We couldn't execute the feature test to a point
// where it could make a determination.
// Don't cache the result, just return it.
return fmt.Errorf("detect support for %s: %w", name, err)
}
return ft.result
}
}

View File

@ -0,0 +1,16 @@
package internal
import "errors"
// DiscardZeroes makes sure that all written bytes are zero
// before discarding them.
type DiscardZeroes struct{}
func (DiscardZeroes) Write(p []byte) (int, error) {
for _, b := range p {
if b != 0 {
return 0, errors.New("encountered non-zero byte")
}
}
return len(p), nil
}

View File

@ -0,0 +1,44 @@
package internal
import (
"errors"
"fmt"
"os"
"github.com/cilium/ebpf/internal/unix"
)
func Pin(currentPath, newPath string, fd *FD) error {
if newPath == "" {
return errors.New("given pinning path cannot be empty")
}
if currentPath == newPath {
return nil
}
if currentPath == "" {
return BPFObjPin(newPath, fd)
}
var err error
// Renameat2 is used instead of os.Rename to disallow the new path replacing
// an existing path.
if err = unix.Renameat2(unix.AT_FDCWD, currentPath, unix.AT_FDCWD, newPath, unix.RENAME_NOREPLACE); err == nil {
// Object is now moved to the new pinning path.
return nil
}
if !os.IsNotExist(err) {
return fmt.Errorf("unable to move pinned object to new path %v: %w", newPath, err)
}
// Internal state not in sync with the file system so let's fix it.
return BPFObjPin(newPath, fd)
}
func Unpin(pinnedPath string) error {
if pinnedPath == "" {
return nil
}
err := os.Remove(pinnedPath)
if err == nil || os.IsNotExist(err) {
return nil
}
return err
}

View File

@ -0,0 +1,31 @@
package internal
import (
"unsafe"
"github.com/cilium/ebpf/internal/unix"
)
// NewPointer creates a 64-bit pointer from an unsafe Pointer.
func NewPointer(ptr unsafe.Pointer) Pointer {
return Pointer{ptr: ptr}
}
// NewSlicePointer creates a 64-bit pointer from a byte slice.
func NewSlicePointer(buf []byte) Pointer {
if len(buf) == 0 {
return Pointer{}
}
return Pointer{ptr: unsafe.Pointer(&buf[0])}
}
// NewStringPointer creates a 64-bit pointer from a string.
func NewStringPointer(str string) Pointer {
p, err := unix.BytePtrFromString(str)
if err != nil {
return Pointer{}
}
return Pointer{ptr: unsafe.Pointer(p)}
}

View File

@ -0,0 +1,14 @@
// +build armbe mips mips64p32
package internal
import (
"unsafe"
)
// Pointer wraps an unsafe.Pointer to be 64bit to
// conform to the syscall specification.
type Pointer struct {
pad uint32
ptr unsafe.Pointer
}

View File

@ -0,0 +1,14 @@
// +build 386 amd64p32 arm mipsle mips64p32le
package internal
import (
"unsafe"
)
// Pointer wraps an unsafe.Pointer to be 64bit to
// conform to the syscall specification.
type Pointer struct {
ptr unsafe.Pointer
pad uint32
}

View File

@ -0,0 +1,14 @@
// +build !386,!amd64p32,!arm,!mipsle,!mips64p32le
// +build !armbe,!mips,!mips64p32
package internal
import (
"unsafe"
)
// Pointer wraps an unsafe.Pointer to be 64bit to
// conform to the syscall specification.
type Pointer struct {
ptr unsafe.Pointer
}

View File

@ -0,0 +1,245 @@
package internal
import (
"fmt"
"path/filepath"
"runtime"
"syscall"
"unsafe"
"github.com/cilium/ebpf/internal/unix"
)
//go:generate stringer -output syscall_string.go -type=BPFCmd
// BPFCmd identifies a subcommand of the bpf syscall.
type BPFCmd int
// Well known BPF commands.
const (
BPF_MAP_CREATE BPFCmd = iota
BPF_MAP_LOOKUP_ELEM
BPF_MAP_UPDATE_ELEM
BPF_MAP_DELETE_ELEM
BPF_MAP_GET_NEXT_KEY
BPF_PROG_LOAD
BPF_OBJ_PIN
BPF_OBJ_GET
BPF_PROG_ATTACH
BPF_PROG_DETACH
BPF_PROG_TEST_RUN
BPF_PROG_GET_NEXT_ID
BPF_MAP_GET_NEXT_ID
BPF_PROG_GET_FD_BY_ID
BPF_MAP_GET_FD_BY_ID
BPF_OBJ_GET_INFO_BY_FD
BPF_PROG_QUERY
BPF_RAW_TRACEPOINT_OPEN
BPF_BTF_LOAD
BPF_BTF_GET_FD_BY_ID
BPF_TASK_FD_QUERY
BPF_MAP_LOOKUP_AND_DELETE_ELEM
BPF_MAP_FREEZE
BPF_BTF_GET_NEXT_ID
BPF_MAP_LOOKUP_BATCH
BPF_MAP_LOOKUP_AND_DELETE_BATCH
BPF_MAP_UPDATE_BATCH
BPF_MAP_DELETE_BATCH
BPF_LINK_CREATE
BPF_LINK_UPDATE
BPF_LINK_GET_FD_BY_ID
BPF_LINK_GET_NEXT_ID
BPF_ENABLE_STATS
BPF_ITER_CREATE
)
// BPF wraps SYS_BPF.
//
// Any pointers contained in attr must use the Pointer type from this package.
func BPF(cmd BPFCmd, attr unsafe.Pointer, size uintptr) (uintptr, error) {
r1, _, errNo := unix.Syscall(unix.SYS_BPF, uintptr(cmd), uintptr(attr), size)
runtime.KeepAlive(attr)
var err error
if errNo != 0 {
err = wrappedErrno{errNo}
}
return r1, err
}
type BPFProgAttachAttr struct {
TargetFd uint32
AttachBpfFd uint32
AttachType uint32
AttachFlags uint32
ReplaceBpfFd uint32
}
func BPFProgAttach(attr *BPFProgAttachAttr) error {
_, err := BPF(BPF_PROG_ATTACH, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
return err
}
type BPFProgDetachAttr struct {
TargetFd uint32
AttachBpfFd uint32
AttachType uint32
}
func BPFProgDetach(attr *BPFProgDetachAttr) error {
_, err := BPF(BPF_PROG_DETACH, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
return err
}
type BPFEnableStatsAttr struct {
StatsType uint32
}
func BPFEnableStats(attr *BPFEnableStatsAttr) (*FD, error) {
ptr, err := BPF(BPF_ENABLE_STATS, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
if err != nil {
return nil, fmt.Errorf("enable stats: %w", err)
}
return NewFD(uint32(ptr)), nil
}
type bpfObjAttr struct {
fileName Pointer
fd uint32
fileFlags uint32
}
const bpfFSType = 0xcafe4a11
// BPFObjPin wraps BPF_OBJ_PIN.
func BPFObjPin(fileName string, fd *FD) error {
dirName := filepath.Dir(fileName)
var statfs unix.Statfs_t
if err := unix.Statfs(dirName, &statfs); err != nil {
return err
}
if uint64(statfs.Type) != bpfFSType {
return fmt.Errorf("%s is not on a bpf filesystem", fileName)
}
value, err := fd.Value()
if err != nil {
return err
}
attr := bpfObjAttr{
fileName: NewStringPointer(fileName),
fd: value,
}
_, err = BPF(BPF_OBJ_PIN, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
if err != nil {
return fmt.Errorf("pin object %s: %w", fileName, err)
}
return nil
}
// BPFObjGet wraps BPF_OBJ_GET.
func BPFObjGet(fileName string, flags uint32) (*FD, error) {
attr := bpfObjAttr{
fileName: NewStringPointer(fileName),
fileFlags: flags,
}
ptr, err := BPF(BPF_OBJ_GET, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
if err != nil {
return nil, fmt.Errorf("get object %s: %w", fileName, err)
}
return NewFD(uint32(ptr)), nil
}
type bpfObjGetInfoByFDAttr struct {
fd uint32
infoLen uint32
info Pointer
}
// BPFObjGetInfoByFD wraps BPF_OBJ_GET_INFO_BY_FD.
//
// Available from 4.13.
func BPFObjGetInfoByFD(fd *FD, info unsafe.Pointer, size uintptr) error {
value, err := fd.Value()
if err != nil {
return err
}
attr := bpfObjGetInfoByFDAttr{
fd: value,
infoLen: uint32(size),
info: NewPointer(info),
}
_, err = BPF(BPF_OBJ_GET_INFO_BY_FD, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
if err != nil {
return fmt.Errorf("fd %v: %w", fd, err)
}
return nil
}
// BPFObjName is a null-terminated string made up of
// 'A-Za-z0-9_' characters.
type BPFObjName [unix.BPF_OBJ_NAME_LEN]byte
// NewBPFObjName truncates the result if it is too long.
func NewBPFObjName(name string) BPFObjName {
var result BPFObjName
copy(result[:unix.BPF_OBJ_NAME_LEN-1], name)
return result
}
type BPFMapCreateAttr struct {
MapType uint32
KeySize uint32
ValueSize uint32
MaxEntries uint32
Flags uint32
InnerMapFd uint32 // since 4.12 56f668dfe00d
NumaNode uint32 // since 4.14 96eabe7a40aa
MapName BPFObjName // since 4.15 ad5b177bd73f
MapIfIndex uint32
BTFFd uint32
BTFKeyTypeID uint32
BTFValueTypeID uint32
}
func BPFMapCreate(attr *BPFMapCreateAttr) (*FD, error) {
fd, err := BPF(BPF_MAP_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
if err != nil {
return nil, err
}
return NewFD(uint32(fd)), nil
}
// wrappedErrno wraps syscall.Errno to prevent direct comparisons with
// syscall.E* or unix.E* constants.
//
// You should never export an error of this type.
type wrappedErrno struct {
syscall.Errno
}
func (we wrappedErrno) Unwrap() error {
return we.Errno
}
type syscallError struct {
error
errno syscall.Errno
}
func SyscallError(err error, errno syscall.Errno) error {
return &syscallError{err, errno}
}
func (se *syscallError) Is(target error) bool {
return target == se.error
}
func (se *syscallError) Unwrap() error {
return se.errno
}

View File

@ -0,0 +1,56 @@
// Code generated by "stringer -output syscall_string.go -type=BPFCmd"; DO NOT EDIT.
package internal
import "strconv"
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[BPF_MAP_CREATE-0]
_ = x[BPF_MAP_LOOKUP_ELEM-1]
_ = x[BPF_MAP_UPDATE_ELEM-2]
_ = x[BPF_MAP_DELETE_ELEM-3]
_ = x[BPF_MAP_GET_NEXT_KEY-4]
_ = x[BPF_PROG_LOAD-5]
_ = x[BPF_OBJ_PIN-6]
_ = x[BPF_OBJ_GET-7]
_ = x[BPF_PROG_ATTACH-8]
_ = x[BPF_PROG_DETACH-9]
_ = x[BPF_PROG_TEST_RUN-10]
_ = x[BPF_PROG_GET_NEXT_ID-11]
_ = x[BPF_MAP_GET_NEXT_ID-12]
_ = x[BPF_PROG_GET_FD_BY_ID-13]
_ = x[BPF_MAP_GET_FD_BY_ID-14]
_ = x[BPF_OBJ_GET_INFO_BY_FD-15]
_ = x[BPF_PROG_QUERY-16]
_ = x[BPF_RAW_TRACEPOINT_OPEN-17]
_ = x[BPF_BTF_LOAD-18]
_ = x[BPF_BTF_GET_FD_BY_ID-19]
_ = x[BPF_TASK_FD_QUERY-20]
_ = x[BPF_MAP_LOOKUP_AND_DELETE_ELEM-21]
_ = x[BPF_MAP_FREEZE-22]
_ = x[BPF_BTF_GET_NEXT_ID-23]
_ = x[BPF_MAP_LOOKUP_BATCH-24]
_ = x[BPF_MAP_LOOKUP_AND_DELETE_BATCH-25]
_ = x[BPF_MAP_UPDATE_BATCH-26]
_ = x[BPF_MAP_DELETE_BATCH-27]
_ = x[BPF_LINK_CREATE-28]
_ = x[BPF_LINK_UPDATE-29]
_ = x[BPF_LINK_GET_FD_BY_ID-30]
_ = x[BPF_LINK_GET_NEXT_ID-31]
_ = x[BPF_ENABLE_STATS-32]
_ = x[BPF_ITER_CREATE-33]
}
const _BPFCmd_name = "BPF_MAP_CREATEBPF_MAP_LOOKUP_ELEMBPF_MAP_UPDATE_ELEMBPF_MAP_DELETE_ELEMBPF_MAP_GET_NEXT_KEYBPF_PROG_LOADBPF_OBJ_PINBPF_OBJ_GETBPF_PROG_ATTACHBPF_PROG_DETACHBPF_PROG_TEST_RUNBPF_PROG_GET_NEXT_IDBPF_MAP_GET_NEXT_IDBPF_PROG_GET_FD_BY_IDBPF_MAP_GET_FD_BY_IDBPF_OBJ_GET_INFO_BY_FDBPF_PROG_QUERYBPF_RAW_TRACEPOINT_OPENBPF_BTF_LOADBPF_BTF_GET_FD_BY_IDBPF_TASK_FD_QUERYBPF_MAP_LOOKUP_AND_DELETE_ELEMBPF_MAP_FREEZEBPF_BTF_GET_NEXT_IDBPF_MAP_LOOKUP_BATCHBPF_MAP_LOOKUP_AND_DELETE_BATCHBPF_MAP_UPDATE_BATCHBPF_MAP_DELETE_BATCHBPF_LINK_CREATEBPF_LINK_UPDATEBPF_LINK_GET_FD_BY_IDBPF_LINK_GET_NEXT_IDBPF_ENABLE_STATSBPF_ITER_CREATE"
var _BPFCmd_index = [...]uint16{0, 14, 33, 52, 71, 91, 104, 115, 126, 141, 156, 173, 193, 212, 233, 253, 275, 289, 312, 324, 344, 361, 391, 405, 424, 444, 475, 495, 515, 530, 545, 566, 586, 602, 617}
func (i BPFCmd) String() string {
if i < 0 || i >= BPFCmd(len(_BPFCmd_index)-1) {
return "BPFCmd(" + strconv.FormatInt(int64(i), 10) + ")"
}
return _BPFCmd_name[_BPFCmd_index[i]:_BPFCmd_index[i+1]]
}

View File

@ -0,0 +1,204 @@
// +build linux
package unix
import (
"bytes"
"syscall"
linux "golang.org/x/sys/unix"
)
const (
ENOENT = linux.ENOENT
EEXIST = linux.EEXIST
EAGAIN = linux.EAGAIN
ENOSPC = linux.ENOSPC
EINVAL = linux.EINVAL
EPOLLIN = linux.EPOLLIN
EINTR = linux.EINTR
EPERM = linux.EPERM
ESRCH = linux.ESRCH
ENODEV = linux.ENODEV
// ENOTSUPP is not the same as ENOTSUP or EOPNOTSUP
ENOTSUPP = syscall.Errno(0x20c)
EBADF = linux.EBADF
BPF_F_NO_PREALLOC = linux.BPF_F_NO_PREALLOC
BPF_F_NUMA_NODE = linux.BPF_F_NUMA_NODE
BPF_F_RDONLY = linux.BPF_F_RDONLY
BPF_F_WRONLY = linux.BPF_F_WRONLY
BPF_F_RDONLY_PROG = linux.BPF_F_RDONLY_PROG
BPF_F_WRONLY_PROG = linux.BPF_F_WRONLY_PROG
BPF_F_SLEEPABLE = linux.BPF_F_SLEEPABLE
BPF_F_MMAPABLE = linux.BPF_F_MMAPABLE
BPF_F_INNER_MAP = linux.BPF_F_INNER_MAP
BPF_OBJ_NAME_LEN = linux.BPF_OBJ_NAME_LEN
BPF_TAG_SIZE = linux.BPF_TAG_SIZE
SYS_BPF = linux.SYS_BPF
F_DUPFD_CLOEXEC = linux.F_DUPFD_CLOEXEC
EPOLL_CTL_ADD = linux.EPOLL_CTL_ADD
EPOLL_CLOEXEC = linux.EPOLL_CLOEXEC
O_CLOEXEC = linux.O_CLOEXEC
O_NONBLOCK = linux.O_NONBLOCK
PROT_READ = linux.PROT_READ
PROT_WRITE = linux.PROT_WRITE
MAP_SHARED = linux.MAP_SHARED
PERF_ATTR_SIZE_VER1 = linux.PERF_ATTR_SIZE_VER1
PERF_TYPE_SOFTWARE = linux.PERF_TYPE_SOFTWARE
PERF_TYPE_TRACEPOINT = linux.PERF_TYPE_TRACEPOINT
PERF_COUNT_SW_BPF_OUTPUT = linux.PERF_COUNT_SW_BPF_OUTPUT
PERF_EVENT_IOC_DISABLE = linux.PERF_EVENT_IOC_DISABLE
PERF_EVENT_IOC_ENABLE = linux.PERF_EVENT_IOC_ENABLE
PERF_EVENT_IOC_SET_BPF = linux.PERF_EVENT_IOC_SET_BPF
PerfBitWatermark = linux.PerfBitWatermark
PERF_SAMPLE_RAW = linux.PERF_SAMPLE_RAW
PERF_FLAG_FD_CLOEXEC = linux.PERF_FLAG_FD_CLOEXEC
RLIM_INFINITY = linux.RLIM_INFINITY
RLIMIT_MEMLOCK = linux.RLIMIT_MEMLOCK
BPF_STATS_RUN_TIME = linux.BPF_STATS_RUN_TIME
PERF_RECORD_LOST = linux.PERF_RECORD_LOST
PERF_RECORD_SAMPLE = linux.PERF_RECORD_SAMPLE
AT_FDCWD = linux.AT_FDCWD
RENAME_NOREPLACE = linux.RENAME_NOREPLACE
)
// Statfs_t is a wrapper
type Statfs_t = linux.Statfs_t
// Rlimit is a wrapper
type Rlimit = linux.Rlimit
// Setrlimit is a wrapper
func Setrlimit(resource int, rlim *Rlimit) (err error) {
return linux.Setrlimit(resource, rlim)
}
// Syscall is a wrapper
func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) {
return linux.Syscall(trap, a1, a2, a3)
}
// FcntlInt is a wrapper
func FcntlInt(fd uintptr, cmd, arg int) (int, error) {
return linux.FcntlInt(fd, cmd, arg)
}
// IoctlSetInt is a wrapper
func IoctlSetInt(fd int, req uint, value int) error {
return linux.IoctlSetInt(fd, req, value)
}
// Statfs is a wrapper
func Statfs(path string, buf *Statfs_t) (err error) {
return linux.Statfs(path, buf)
}
// Close is a wrapper
func Close(fd int) (err error) {
return linux.Close(fd)
}
// EpollEvent is a wrapper
type EpollEvent = linux.EpollEvent
// EpollWait is a wrapper
func EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) {
return linux.EpollWait(epfd, events, msec)
}
// EpollCtl is a wrapper
func EpollCtl(epfd int, op int, fd int, event *EpollEvent) (err error) {
return linux.EpollCtl(epfd, op, fd, event)
}
// Eventfd is a wrapper
func Eventfd(initval uint, flags int) (fd int, err error) {
return linux.Eventfd(initval, flags)
}
// Write is a wrapper
func Write(fd int, p []byte) (n int, err error) {
return linux.Write(fd, p)
}
// EpollCreate1 is a wrapper
func EpollCreate1(flag int) (fd int, err error) {
return linux.EpollCreate1(flag)
}
// PerfEventMmapPage is a wrapper
type PerfEventMmapPage linux.PerfEventMmapPage
// SetNonblock is a wrapper
func SetNonblock(fd int, nonblocking bool) (err error) {
return linux.SetNonblock(fd, nonblocking)
}
// Mmap is a wrapper
func Mmap(fd int, offset int64, length int, prot int, flags int) (data []byte, err error) {
return linux.Mmap(fd, offset, length, prot, flags)
}
// Munmap is a wrapper
func Munmap(b []byte) (err error) {
return linux.Munmap(b)
}
// PerfEventAttr is a wrapper
type PerfEventAttr = linux.PerfEventAttr
// PerfEventOpen is a wrapper
func PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int) (fd int, err error) {
return linux.PerfEventOpen(attr, pid, cpu, groupFd, flags)
}
// Utsname is a wrapper
type Utsname = linux.Utsname
// Uname is a wrapper
func Uname(buf *Utsname) (err error) {
return linux.Uname(buf)
}
// Getpid is a wrapper
func Getpid() int {
return linux.Getpid()
}
// Gettid is a wrapper
func Gettid() int {
return linux.Gettid()
}
// Tgkill is a wrapper
func Tgkill(tgid int, tid int, sig syscall.Signal) (err error) {
return linux.Tgkill(tgid, tid, sig)
}
// BytePtrFromString is a wrapper
func BytePtrFromString(s string) (*byte, error) {
return linux.BytePtrFromString(s)
}
// ByteSliceToString is a wrapper
func ByteSliceToString(s []byte) string {
return linux.ByteSliceToString(s)
}
// Renameat2 is a wrapper
func Renameat2(olddirfd int, oldpath string, newdirfd int, newpath string, flags uint) error {
return linux.Renameat2(olddirfd, oldpath, newdirfd, newpath, flags)
}
func KernelRelease() (string, error) {
var uname Utsname
err := Uname(&uname)
if err != nil {
return "", err
}
end := bytes.IndexByte(uname.Release[:], 0)
release := string(uname.Release[:end])
return release, nil
}

View File

@ -0,0 +1,263 @@
// +build !linux
package unix
import (
"fmt"
"runtime"
"syscall"
)
var errNonLinux = fmt.Errorf("unsupported platform %s/%s", runtime.GOOS, runtime.GOARCH)
const (
ENOENT = syscall.ENOENT
EEXIST = syscall.EEXIST
EAGAIN = syscall.EAGAIN
ENOSPC = syscall.ENOSPC
EINVAL = syscall.EINVAL
EINTR = syscall.EINTR
EPERM = syscall.EPERM
ESRCH = syscall.ESRCH
ENODEV = syscall.ENODEV
EBADF = syscall.Errno(0)
// ENOTSUPP is not the same as ENOTSUP or EOPNOTSUP
ENOTSUPP = syscall.Errno(0x20c)
BPF_F_NO_PREALLOC = 0
BPF_F_NUMA_NODE = 0
BPF_F_RDONLY = 0
BPF_F_WRONLY = 0
BPF_F_RDONLY_PROG = 0
BPF_F_WRONLY_PROG = 0
BPF_F_SLEEPABLE = 0
BPF_F_MMAPABLE = 0
BPF_F_INNER_MAP = 0
BPF_OBJ_NAME_LEN = 0x10
BPF_TAG_SIZE = 0x8
SYS_BPF = 321
F_DUPFD_CLOEXEC = 0x406
EPOLLIN = 0x1
EPOLL_CTL_ADD = 0x1
EPOLL_CLOEXEC = 0x80000
O_CLOEXEC = 0x80000
O_NONBLOCK = 0x800
PROT_READ = 0x1
PROT_WRITE = 0x2
MAP_SHARED = 0x1
PERF_ATTR_SIZE_VER1 = 0
PERF_TYPE_SOFTWARE = 0x1
PERF_TYPE_TRACEPOINT = 0
PERF_COUNT_SW_BPF_OUTPUT = 0xa
PERF_EVENT_IOC_DISABLE = 0
PERF_EVENT_IOC_ENABLE = 0
PERF_EVENT_IOC_SET_BPF = 0
PerfBitWatermark = 0x4000
PERF_SAMPLE_RAW = 0x400
PERF_FLAG_FD_CLOEXEC = 0x8
RLIM_INFINITY = 0x7fffffffffffffff
RLIMIT_MEMLOCK = 8
BPF_STATS_RUN_TIME = 0
PERF_RECORD_LOST = 2
PERF_RECORD_SAMPLE = 9
AT_FDCWD = -0x2
RENAME_NOREPLACE = 0x1
)
// Statfs_t is a wrapper
type Statfs_t struct {
Type int64
Bsize int64
Blocks uint64
Bfree uint64
Bavail uint64
Files uint64
Ffree uint64
Fsid [2]int32
Namelen int64
Frsize int64
Flags int64
Spare [4]int64
}
// Rlimit is a wrapper
type Rlimit struct {
Cur uint64
Max uint64
}
// Setrlimit is a wrapper
func Setrlimit(resource int, rlim *Rlimit) (err error) {
return errNonLinux
}
// Syscall is a wrapper
func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) {
return 0, 0, syscall.Errno(1)
}
// FcntlInt is a wrapper
func FcntlInt(fd uintptr, cmd, arg int) (int, error) {
return -1, errNonLinux
}
// IoctlSetInt is a wrapper
func IoctlSetInt(fd int, req uint, value int) error {
return errNonLinux
}
// Statfs is a wrapper
func Statfs(path string, buf *Statfs_t) error {
return errNonLinux
}
// Close is a wrapper
func Close(fd int) (err error) {
return errNonLinux
}
// EpollEvent is a wrapper
type EpollEvent struct {
Events uint32
Fd int32
Pad int32
}
// EpollWait is a wrapper
func EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) {
return 0, errNonLinux
}
// EpollCtl is a wrapper
func EpollCtl(epfd int, op int, fd int, event *EpollEvent) (err error) {
return errNonLinux
}
// Eventfd is a wrapper
func Eventfd(initval uint, flags int) (fd int, err error) {
return 0, errNonLinux
}
// Write is a wrapper
func Write(fd int, p []byte) (n int, err error) {
return 0, errNonLinux
}
// EpollCreate1 is a wrapper
func EpollCreate1(flag int) (fd int, err error) {
return 0, errNonLinux
}
// PerfEventMmapPage is a wrapper
type PerfEventMmapPage struct {
Version uint32
Compat_version uint32
Lock uint32
Index uint32
Offset int64
Time_enabled uint64
Time_running uint64
Capabilities uint64
Pmc_width uint16
Time_shift uint16
Time_mult uint32
Time_offset uint64
Time_zero uint64
Size uint32
Data_head uint64
Data_tail uint64
Data_offset uint64
Data_size uint64
Aux_head uint64
Aux_tail uint64
Aux_offset uint64
Aux_size uint64
}
// SetNonblock is a wrapper
func SetNonblock(fd int, nonblocking bool) (err error) {
return errNonLinux
}
// Mmap is a wrapper
func Mmap(fd int, offset int64, length int, prot int, flags int) (data []byte, err error) {
return []byte{}, errNonLinux
}
// Munmap is a wrapper
func Munmap(b []byte) (err error) {
return errNonLinux
}
// PerfEventAttr is a wrapper
type PerfEventAttr struct {
Type uint32
Size uint32
Config uint64
Sample uint64
Sample_type uint64
Read_format uint64
Bits uint64
Wakeup uint32
Bp_type uint32
Ext1 uint64
Ext2 uint64
Branch_sample_type uint64
Sample_regs_user uint64
Sample_stack_user uint32
Clockid int32
Sample_regs_intr uint64
Aux_watermark uint32
Sample_max_stack uint16
}
// PerfEventOpen is a wrapper
func PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int) (fd int, err error) {
return 0, errNonLinux
}
// Utsname is a wrapper
type Utsname struct {
Release [65]byte
Version [65]byte
}
// Uname is a wrapper
func Uname(buf *Utsname) (err error) {
return errNonLinux
}
// Getpid is a wrapper
func Getpid() int {
return -1
}
// Gettid is a wrapper
func Gettid() int {
return -1
}
// Tgkill is a wrapper
func Tgkill(tgid int, tid int, sig syscall.Signal) (err error) {
return errNonLinux
}
// BytePtrFromString is a wrapper
func BytePtrFromString(s string) (*byte, error) {
return nil, errNonLinux
}
// ByteSliceToString is a wrapper
func ByteSliceToString(s []byte) string {
return ""
}
// Renameat2 is a wrapper
func Renameat2(olddirfd int, oldpath string, newdirfd int, newpath string, flags uint) error {
return errNonLinux
}
func KernelRelease() (string, error) {
return "", errNonLinux
}

View File

@ -0,0 +1,163 @@
package internal
import (
"fmt"
"io/ioutil"
"regexp"
"sync"
"github.com/cilium/ebpf/internal/unix"
)
const (
// Version constant used in ELF binaries indicating that the loader needs to
// substitute the eBPF program's version with the value of the kernel's
// KERNEL_VERSION compile-time macro. Used for compatibility with BCC, gobpf
// and RedSift.
MagicKernelVersion = 0xFFFFFFFE
)
var (
// Match between one and three decimals separated by dots, with the last
// segment (patch level) being optional on some kernels.
// The x.y.z string must appear at the start of a string or right after
// whitespace to prevent sequences like 'x.y.z-a.b.c' from matching 'a.b.c'.
rgxKernelVersion = regexp.MustCompile(`(?:\A|\s)\d{1,3}\.\d{1,3}(?:\.\d{1,3})?`)
kernelVersion = struct {
once sync.Once
version Version
err error
}{}
)
// A Version in the form Major.Minor.Patch.
type Version [3]uint16
// NewVersion creates a version from a string like "Major.Minor.Patch".
//
// Patch is optional.
func NewVersion(ver string) (Version, error) {
var major, minor, patch uint16
n, _ := fmt.Sscanf(ver, "%d.%d.%d", &major, &minor, &patch)
if n < 2 {
return Version{}, fmt.Errorf("invalid version: %s", ver)
}
return Version{major, minor, patch}, nil
}
func (v Version) String() string {
if v[2] == 0 {
return fmt.Sprintf("v%d.%d", v[0], v[1])
}
return fmt.Sprintf("v%d.%d.%d", v[0], v[1], v[2])
}
// Less returns true if the version is less than another version.
func (v Version) Less(other Version) bool {
for i, a := range v {
if a == other[i] {
continue
}
return a < other[i]
}
return false
}
// Unspecified returns true if the version is all zero.
func (v Version) Unspecified() bool {
return v[0] == 0 && v[1] == 0 && v[2] == 0
}
// Kernel implements the kernel's KERNEL_VERSION macro from linux/version.h.
// It represents the kernel version and patch level as a single value.
func (v Version) Kernel() uint32 {
// Kernels 4.4 and 4.9 have their SUBLEVEL clamped to 255 to avoid
// overflowing into PATCHLEVEL.
// See kernel commit 9b82f13e7ef3 ("kbuild: clamp SUBLEVEL to 255").
s := v[2]
if s > 255 {
s = 255
}
// Truncate members to uint8 to prevent them from spilling over into
// each other when overflowing 8 bits.
return uint32(uint8(v[0]))<<16 | uint32(uint8(v[1]))<<8 | uint32(uint8(s))
}
// KernelVersion returns the version of the currently running kernel.
func KernelVersion() (Version, error) {
kernelVersion.once.Do(func() {
kernelVersion.version, kernelVersion.err = detectKernelVersion()
})
if kernelVersion.err != nil {
return Version{}, kernelVersion.err
}
return kernelVersion.version, nil
}
// detectKernelVersion returns the version of the running kernel. It scans the
// following sources in order: /proc/version_signature, uname -v, uname -r.
// In each of those locations, the last-appearing x.y(.z) value is selected
// for parsing. The first location that yields a usable version number is
// returned.
func detectKernelVersion() (Version, error) {
// Try reading /proc/version_signature for Ubuntu compatibility.
// Example format: Ubuntu 4.15.0-91.92-generic 4.15.18
// This method exists in the kernel itself, see d18acd15c
// ("perf tools: Fix kernel version error in ubuntu").
if pvs, err := ioutil.ReadFile("/proc/version_signature"); err == nil {
// If /proc/version_signature exists, failing to parse it is an error.
// It only exists on Ubuntu, where the real patch level is not obtainable
// through any other method.
v, err := findKernelVersion(string(pvs))
if err != nil {
return Version{}, err
}
return v, nil
}
var uname unix.Utsname
if err := unix.Uname(&uname); err != nil {
return Version{}, fmt.Errorf("calling uname: %w", err)
}
// Debian puts the version including the patch level in uname.Version.
// It is not an error if there's no version number in uname.Version,
// as most distributions don't use it. Parsing can continue on uname.Release.
// Example format: #1 SMP Debian 4.19.37-5+deb10u2 (2019-08-08)
if v, err := findKernelVersion(unix.ByteSliceToString(uname.Version[:])); err == nil {
return v, nil
}
// Most other distributions have the full kernel version including patch
// level in uname.Release.
// Example format: 4.19.0-5-amd64, 5.5.10-arch1-1
v, err := findKernelVersion(unix.ByteSliceToString(uname.Release[:]))
if err != nil {
return Version{}, err
}
return v, nil
}
// findKernelVersion matches s against rgxKernelVersion and parses the result
// into a Version. If s contains multiple matches, the last entry is selected.
func findKernelVersion(s string) (Version, error) {
m := rgxKernelVersion.FindAllString(s, -1)
if m == nil {
return Version{}, fmt.Errorf("no kernel version in string: %s", s)
}
// Pick the last match of the string in case there are multiple.
s = m[len(m)-1]
v, err := NewVersion(s)
if err != nil {
return Version{}, fmt.Errorf("parsing version string %s: %w", s, err)
}
return v, nil
}

View File

@ -0,0 +1,171 @@
package link
import (
"errors"
"fmt"
"os"
"github.com/cilium/ebpf"
)
type cgroupAttachFlags uint32
// cgroup attach flags
const (
flagAllowOverride cgroupAttachFlags = 1 << iota
flagAllowMulti
flagReplace
)
type CgroupOptions struct {
// Path to a cgroupv2 folder.
Path string
// One of the AttachCgroup* constants
Attach ebpf.AttachType
// Program must be of type CGroup*, and the attach type must match Attach.
Program *ebpf.Program
}
// AttachCgroup links a BPF program to a cgroup.
func AttachCgroup(opts CgroupOptions) (Link, error) {
cgroup, err := os.Open(opts.Path)
if err != nil {
return nil, fmt.Errorf("can't open cgroup: %s", err)
}
clone, err := opts.Program.Clone()
if err != nil {
cgroup.Close()
return nil, err
}
var cg Link
cg, err = newLinkCgroup(cgroup, opts.Attach, clone)
if errors.Is(err, ErrNotSupported) {
cg, err = newProgAttachCgroup(cgroup, opts.Attach, clone, flagAllowMulti)
}
if errors.Is(err, ErrNotSupported) {
cg, err = newProgAttachCgroup(cgroup, opts.Attach, clone, flagAllowOverride)
}
if err != nil {
cgroup.Close()
clone.Close()
return nil, err
}
return cg, nil
}
// LoadPinnedCgroup loads a pinned cgroup from a bpffs.
func LoadPinnedCgroup(fileName string, opts *ebpf.LoadPinOptions) (Link, error) {
link, err := LoadPinnedRawLink(fileName, CgroupType, opts)
if err != nil {
return nil, err
}
return &linkCgroup{*link}, nil
}
type progAttachCgroup struct {
cgroup *os.File
current *ebpf.Program
attachType ebpf.AttachType
flags cgroupAttachFlags
}
var _ Link = (*progAttachCgroup)(nil)
func (cg *progAttachCgroup) isLink() {}
func newProgAttachCgroup(cgroup *os.File, attach ebpf.AttachType, prog *ebpf.Program, flags cgroupAttachFlags) (*progAttachCgroup, error) {
if flags&flagAllowMulti > 0 {
if err := haveProgAttachReplace(); err != nil {
return nil, fmt.Errorf("can't support multiple programs: %w", err)
}
}
err := RawAttachProgram(RawAttachProgramOptions{
Target: int(cgroup.Fd()),
Program: prog,
Flags: uint32(flags),
Attach: attach,
})
if err != nil {
return nil, fmt.Errorf("cgroup: %w", err)
}
return &progAttachCgroup{cgroup, prog, attach, flags}, nil
}
func (cg *progAttachCgroup) Close() error {
defer cg.cgroup.Close()
defer cg.current.Close()
err := RawDetachProgram(RawDetachProgramOptions{
Target: int(cg.cgroup.Fd()),
Program: cg.current,
Attach: cg.attachType,
})
if err != nil {
return fmt.Errorf("close cgroup: %s", err)
}
return nil
}
func (cg *progAttachCgroup) Update(prog *ebpf.Program) error {
new, err := prog.Clone()
if err != nil {
return err
}
args := RawAttachProgramOptions{
Target: int(cg.cgroup.Fd()),
Program: prog,
Attach: cg.attachType,
Flags: uint32(cg.flags),
}
if cg.flags&flagAllowMulti > 0 {
// Atomically replacing multiple programs requires at least
// 5.5 (commit 7dd68b3279f17921 "bpf: Support replacing cgroup-bpf
// program in MULTI mode")
args.Flags |= uint32(flagReplace)
args.Replace = cg.current
}
if err := RawAttachProgram(args); err != nil {
new.Close()
return fmt.Errorf("can't update cgroup: %s", err)
}
cg.current.Close()
cg.current = new
return nil
}
func (cg *progAttachCgroup) Pin(string) error {
return fmt.Errorf("can't pin cgroup: %w", ErrNotSupported)
}
func (cg *progAttachCgroup) Unpin() error {
return fmt.Errorf("can't pin cgroup: %w", ErrNotSupported)
}
type linkCgroup struct {
RawLink
}
var _ Link = (*linkCgroup)(nil)
func newLinkCgroup(cgroup *os.File, attach ebpf.AttachType, prog *ebpf.Program) (*linkCgroup, error) {
link, err := AttachRawLink(RawLinkOptions{
Target: int(cgroup.Fd()),
Program: prog,
Attach: attach,
})
if err != nil {
return nil, err
}
return &linkCgroup{*link}, err
}

View File

@ -0,0 +1,2 @@
// Package link allows attaching eBPF programs to various kernel hooks.
package link

100
src/runtime/vendor/github.com/cilium/ebpf/link/iter.go generated vendored Normal file
View File

@ -0,0 +1,100 @@
package link
import (
"fmt"
"io"
"unsafe"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/internal"
)
type IterOptions struct {
// Program must be of type Tracing with attach type
// AttachTraceIter. The kind of iterator to attach to is
// determined at load time via the AttachTo field.
//
// AttachTo requires the kernel to include BTF of itself,
// and it to be compiled with a recent pahole (>= 1.16).
Program *ebpf.Program
// Map specifies the target map for bpf_map_elem and sockmap iterators.
// It may be nil.
Map *ebpf.Map
}
// AttachIter attaches a BPF seq_file iterator.
func AttachIter(opts IterOptions) (*Iter, error) {
if err := haveBPFLink(); err != nil {
return nil, err
}
progFd := opts.Program.FD()
if progFd < 0 {
return nil, fmt.Errorf("invalid program: %s", internal.ErrClosedFd)
}
var info bpfIterLinkInfoMap
if opts.Map != nil {
mapFd := opts.Map.FD()
if mapFd < 0 {
return nil, fmt.Errorf("invalid map: %w", internal.ErrClosedFd)
}
info.map_fd = uint32(mapFd)
}
attr := bpfLinkCreateIterAttr{
prog_fd: uint32(progFd),
attach_type: ebpf.AttachTraceIter,
iter_info: internal.NewPointer(unsafe.Pointer(&info)),
iter_info_len: uint32(unsafe.Sizeof(info)),
}
fd, err := bpfLinkCreateIter(&attr)
if err != nil {
return nil, fmt.Errorf("can't link iterator: %w", err)
}
return &Iter{RawLink{fd, ""}}, err
}
// LoadPinnedIter loads a pinned iterator from a bpffs.
func LoadPinnedIter(fileName string, opts *ebpf.LoadPinOptions) (*Iter, error) {
link, err := LoadPinnedRawLink(fileName, IterType, opts)
if err != nil {
return nil, err
}
return &Iter{*link}, err
}
// Iter represents an attached bpf_iter.
type Iter struct {
RawLink
}
// Open creates a new instance of the iterator.
//
// Reading from the returned reader triggers the BPF program.
func (it *Iter) Open() (io.ReadCloser, error) {
linkFd, err := it.fd.Value()
if err != nil {
return nil, err
}
attr := &bpfIterCreateAttr{
linkFd: linkFd,
}
fd, err := bpfIterCreate(attr)
if err != nil {
return nil, fmt.Errorf("can't create iterator: %w", err)
}
return fd.File("bpf_iter"), nil
}
// union bpf_iter_link_info.map
type bpfIterLinkInfoMap struct {
map_fd uint32
}

View File

@ -0,0 +1,438 @@
package link
import (
"bytes"
"crypto/rand"
"errors"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"runtime"
"sync"
"unsafe"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/internal"
"github.com/cilium/ebpf/internal/unix"
)
var (
kprobeEventsPath = filepath.Join(tracefsPath, "kprobe_events")
kprobeRetprobeBit = struct {
once sync.Once
value uint64
err error
}{}
)
type probeType uint8
const (
kprobeType probeType = iota
uprobeType
)
func (pt probeType) String() string {
if pt == kprobeType {
return "kprobe"
}
return "uprobe"
}
func (pt probeType) EventsPath() string {
if pt == kprobeType {
return kprobeEventsPath
}
return uprobeEventsPath
}
func (pt probeType) PerfEventType(ret bool) perfEventType {
if pt == kprobeType {
if ret {
return kretprobeEvent
}
return kprobeEvent
}
if ret {
return uretprobeEvent
}
return uprobeEvent
}
func (pt probeType) RetprobeBit() (uint64, error) {
if pt == kprobeType {
return kretprobeBit()
}
return uretprobeBit()
}
// Kprobe attaches the given eBPF program to a perf event that fires when the
// given kernel symbol starts executing. See /proc/kallsyms for available
// symbols. For example, printk():
//
// Kprobe("printk", prog)
//
// The resulting Link must be Closed during program shutdown to avoid leaking
// system resources.
func Kprobe(symbol string, prog *ebpf.Program) (Link, error) {
k, err := kprobe(symbol, prog, false)
if err != nil {
return nil, err
}
err = k.attach(prog)
if err != nil {
k.Close()
return nil, err
}
return k, nil
}
// Kretprobe attaches the given eBPF program to a perf event that fires right
// before the given kernel symbol exits, with the function stack left intact.
// See /proc/kallsyms for available symbols. For example, printk():
//
// Kretprobe("printk", prog)
//
// The resulting Link must be Closed during program shutdown to avoid leaking
// system resources.
func Kretprobe(symbol string, prog *ebpf.Program) (Link, error) {
k, err := kprobe(symbol, prog, true)
if err != nil {
return nil, err
}
err = k.attach(prog)
if err != nil {
k.Close()
return nil, err
}
return k, nil
}
// kprobe opens a perf event on the given symbol and attaches prog to it.
// If ret is true, create a kretprobe.
func kprobe(symbol string, prog *ebpf.Program, ret bool) (*perfEvent, error) {
if symbol == "" {
return nil, fmt.Errorf("symbol name cannot be empty: %w", errInvalidInput)
}
if prog == nil {
return nil, fmt.Errorf("prog cannot be nil: %w", errInvalidInput)
}
if !rgxTraceEvent.MatchString(symbol) {
return nil, fmt.Errorf("symbol '%s' must be alphanumeric or underscore: %w", symbol, errInvalidInput)
}
if prog.Type() != ebpf.Kprobe {
return nil, fmt.Errorf("eBPF program type %s is not a Kprobe: %w", prog.Type(), errInvalidInput)
}
// Use kprobe PMU if the kernel has it available.
tp, err := pmuKprobe(platformPrefix(symbol), ret)
if errors.Is(err, os.ErrNotExist) {
tp, err = pmuKprobe(symbol, ret)
}
if err == nil {
return tp, nil
}
if err != nil && !errors.Is(err, ErrNotSupported) {
return nil, fmt.Errorf("creating perf_kprobe PMU: %w", err)
}
// Use tracefs if kprobe PMU is missing.
tp, err = tracefsKprobe(platformPrefix(symbol), ret)
if errors.Is(err, os.ErrNotExist) {
tp, err = tracefsKprobe(symbol, ret)
}
if err != nil {
return nil, fmt.Errorf("creating trace event '%s' in tracefs: %w", symbol, err)
}
return tp, nil
}
// pmuKprobe opens a perf event based on the kprobe PMU.
// Returns os.ErrNotExist if the given symbol does not exist in the kernel.
func pmuKprobe(symbol string, ret bool) (*perfEvent, error) {
return pmuProbe(kprobeType, symbol, "", 0, ret)
}
// pmuProbe opens a perf event based on a Performance Monitoring Unit.
//
// Requires at least a 4.17 kernel.
// e12f03d7031a "perf/core: Implement the 'perf_kprobe' PMU"
// 33ea4b24277b "perf/core: Implement the 'perf_uprobe' PMU"
//
// Returns ErrNotSupported if the kernel doesn't support perf_[k,u]probe PMU
func pmuProbe(typ probeType, symbol, path string, offset uint64, ret bool) (*perfEvent, error) {
// Getting the PMU type will fail if the kernel doesn't support
// the perf_[k,u]probe PMU.
et, err := getPMUEventType(typ)
if err != nil {
return nil, err
}
var config uint64
if ret {
bit, err := typ.RetprobeBit()
if err != nil {
return nil, err
}
config |= 1 << bit
}
var (
attr unix.PerfEventAttr
sp unsafe.Pointer
)
switch typ {
case kprobeType:
// Create a pointer to a NUL-terminated string for the kernel.
sp, err := unsafeStringPtr(symbol)
if err != nil {
return nil, err
}
attr = unix.PerfEventAttr{
Type: uint32(et), // PMU event type read from sysfs
Ext1: uint64(uintptr(sp)), // Kernel symbol to trace
Config: config, // Retprobe flag
}
case uprobeType:
sp, err := unsafeStringPtr(path)
if err != nil {
return nil, err
}
attr = unix.PerfEventAttr{
// The minimum size required for PMU uprobes is PERF_ATTR_SIZE_VER1,
// since it added the config2 (Ext2) field. The Size field controls the
// size of the internal buffer the kernel allocates for reading the
// perf_event_attr argument from userspace.
Size: unix.PERF_ATTR_SIZE_VER1,
Type: uint32(et), // PMU event type read from sysfs
Ext1: uint64(uintptr(sp)), // Uprobe path
Ext2: offset, // Uprobe offset
Config: config, // Retprobe flag
}
}
fd, err := unix.PerfEventOpen(&attr, perfAllThreads, 0, -1, unix.PERF_FLAG_FD_CLOEXEC)
// Since commit 97c753e62e6c, ENOENT is correctly returned instead of EINVAL
// when trying to create a kretprobe for a missing symbol. Make sure ENOENT
// is returned to the caller.
if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.EINVAL) {
return nil, fmt.Errorf("symbol '%s' not found: %w", symbol, os.ErrNotExist)
}
if err != nil {
return nil, fmt.Errorf("opening perf event: %w", err)
}
// Ensure the string pointer is not collected before PerfEventOpen returns.
runtime.KeepAlive(sp)
// Kernel has perf_[k,u]probe PMU available, initialize perf event.
return &perfEvent{
fd: internal.NewFD(uint32(fd)),
pmuID: et,
name: symbol,
typ: typ.PerfEventType(ret),
}, nil
}
// tracefsKprobe creates a Kprobe tracefs entry.
func tracefsKprobe(symbol string, ret bool) (*perfEvent, error) {
return tracefsProbe(kprobeType, symbol, "", 0, ret)
}
// tracefsProbe creates a trace event by writing an entry to <tracefs>/[k,u]probe_events.
// A new trace event group name is generated on every call to support creating
// multiple trace events for the same kernel or userspace symbol.
// Path and offset are only set in the case of uprobe(s) and are used to set
// the executable/library path on the filesystem and the offset where the probe is inserted.
// A perf event is then opened on the newly-created trace event and returned to the caller.
func tracefsProbe(typ probeType, symbol, path string, offset uint64, ret bool) (*perfEvent, error) {
// Generate a random string for each trace event we attempt to create.
// This value is used as the 'group' token in tracefs to allow creating
// multiple kprobe trace events with the same name.
group, err := randomGroup("ebpf")
if err != nil {
return nil, fmt.Errorf("randomizing group name: %w", err)
}
// Before attempting to create a trace event through tracefs,
// check if an event with the same group and name already exists.
// Kernels 4.x and earlier don't return os.ErrExist on writing a duplicate
// entry, so we need to rely on reads for detecting uniqueness.
_, err = getTraceEventID(group, symbol)
if err == nil {
return nil, fmt.Errorf("trace event already exists: %s/%s", group, symbol)
}
if err != nil && !errors.Is(err, os.ErrNotExist) {
return nil, fmt.Errorf("checking trace event %s/%s: %w", group, symbol, err)
}
// Create the [k,u]probe trace event using tracefs.
if err := createTraceFSProbeEvent(typ, group, symbol, path, offset, ret); err != nil {
return nil, fmt.Errorf("creating probe entry on tracefs: %w", err)
}
// Get the newly-created trace event's id.
tid, err := getTraceEventID(group, symbol)
if err != nil {
return nil, fmt.Errorf("getting trace event id: %w", err)
}
// Kprobes are ephemeral tracepoints and share the same perf event type.
fd, err := openTracepointPerfEvent(tid)
if err != nil {
return nil, err
}
return &perfEvent{
fd: fd,
group: group,
name: symbol,
tracefsID: tid,
typ: typ.PerfEventType(ret),
}, nil
}
// createTraceFSProbeEvent creates a new ephemeral trace event by writing to
// <tracefs>/[k,u]probe_events. Returns os.ErrNotExist if symbol is not a valid
// kernel symbol, or if it is not traceable with kprobes. Returns os.ErrExist
// if a probe with the same group and symbol already exists.
func createTraceFSProbeEvent(typ probeType, group, symbol, path string, offset uint64, ret bool) error {
// Open the kprobe_events file in tracefs.
f, err := os.OpenFile(typ.EventsPath(), os.O_APPEND|os.O_WRONLY, 0666)
if err != nil {
return fmt.Errorf("error opening '%s': %w", typ.EventsPath(), err)
}
defer f.Close()
var pe string
switch typ {
case kprobeType:
// The kprobe_events syntax is as follows (see Documentation/trace/kprobetrace.txt):
// p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe
// r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe
// -:[GRP/]EVENT : Clear a probe
//
// Some examples:
// r:ebpf_1234/r_my_kretprobe nf_conntrack_destroy
// p:ebpf_5678/p_my_kprobe __x64_sys_execve
//
// Leaving the kretprobe's MAXACTIVE set to 0 (or absent) will make the
// kernel default to NR_CPUS. This is desired in most eBPF cases since
// subsampling or rate limiting logic can be more accurately implemented in
// the eBPF program itself.
// See Documentation/kprobes.txt for more details.
pe = fmt.Sprintf("%s:%s/%s %s", probePrefix(ret), group, symbol, symbol)
case uprobeType:
// The uprobe_events syntax is as follows:
// p[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a probe
// r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a return probe
// -:[GRP/]EVENT : Clear a probe
//
// Some examples:
// r:ebpf_1234/readline /bin/bash:0x12345
// p:ebpf_5678/main_mySymbol /bin/mybin:0x12345
//
// See Documentation/trace/uprobetracer.txt for more details.
pathOffset := uprobePathOffset(path, offset)
pe = fmt.Sprintf("%s:%s/%s %s", probePrefix(ret), group, symbol, pathOffset)
}
_, err = f.WriteString(pe)
// Since commit 97c753e62e6c, ENOENT is correctly returned instead of EINVAL
// when trying to create a kretprobe for a missing symbol. Make sure ENOENT
// is returned to the caller.
if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.EINVAL) {
return fmt.Errorf("symbol %s not found: %w", symbol, os.ErrNotExist)
}
if err != nil {
return fmt.Errorf("writing '%s' to '%s': %w", pe, typ.EventsPath(), err)
}
return nil
}
// closeTraceFSProbeEvent removes the [k,u]probe with the given type, group and symbol
// from <tracefs>/[k,u]probe_events.
func closeTraceFSProbeEvent(typ probeType, group, symbol string) error {
f, err := os.OpenFile(typ.EventsPath(), os.O_APPEND|os.O_WRONLY, 0666)
if err != nil {
return fmt.Errorf("error opening %s: %w", typ.EventsPath(), err)
}
defer f.Close()
// See [k,u]probe_events syntax above. The probe type does not need to be specified
// for removals.
pe := fmt.Sprintf("-:%s/%s", group, symbol)
if _, err = f.WriteString(pe); err != nil {
return fmt.Errorf("writing '%s' to '%s': %w", pe, typ.EventsPath(), err)
}
return nil
}
// randomGroup generates a pseudorandom string for use as a tracefs group name.
// Returns an error when the output string would exceed 63 characters (kernel
// limitation), when rand.Read() fails or when prefix contains characters not
// allowed by rgxTraceEvent.
func randomGroup(prefix string) (string, error) {
if !rgxTraceEvent.MatchString(prefix) {
return "", fmt.Errorf("prefix '%s' must be alphanumeric or underscore: %w", prefix, errInvalidInput)
}
b := make([]byte, 8)
if _, err := rand.Read(b); err != nil {
return "", fmt.Errorf("reading random bytes: %w", err)
}
group := fmt.Sprintf("%s_%x", prefix, b)
if len(group) > 63 {
return "", fmt.Errorf("group name '%s' cannot be longer than 63 characters: %w", group, errInvalidInput)
}
return group, nil
}
func probePrefix(ret bool) string {
if ret {
return "r"
}
return "p"
}
// determineRetprobeBit reads a Performance Monitoring Unit's retprobe bit
// from /sys/bus/event_source/devices/<pmu>/format/retprobe.
func determineRetprobeBit(typ probeType) (uint64, error) {
p := filepath.Join("/sys/bus/event_source/devices/", typ.String(), "/format/retprobe")
data, err := ioutil.ReadFile(p)
if err != nil {
return 0, err
}
var rp uint64
n, err := fmt.Sscanf(string(bytes.TrimSpace(data)), "config:%d", &rp)
if err != nil {
return 0, fmt.Errorf("parse retprobe bit: %w", err)
}
if n != 1 {
return 0, fmt.Errorf("parse retprobe bit: expected 1 item, got %d", n)
}
return rp, nil
}
func kretprobeBit() (uint64, error) {
kprobeRetprobeBit.once.Do(func() {
kprobeRetprobeBit.value, kprobeRetprobeBit.err = determineRetprobeBit(kprobeType)
})
return kprobeRetprobeBit.value, kprobeRetprobeBit.err
}

229
src/runtime/vendor/github.com/cilium/ebpf/link/link.go generated vendored Normal file
View File

@ -0,0 +1,229 @@
package link
import (
"fmt"
"unsafe"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/internal"
)
var ErrNotSupported = internal.ErrNotSupported
// Link represents a Program attached to a BPF hook.
type Link interface {
// Replace the current program with a new program.
//
// Passing a nil program is an error. May return an error wrapping ErrNotSupported.
Update(*ebpf.Program) error
// Persist a link by pinning it into a bpffs.
//
// May return an error wrapping ErrNotSupported.
Pin(string) error
// Undo a previous call to Pin.
//
// May return an error wrapping ErrNotSupported.
Unpin() error
// Close frees resources.
//
// The link will be broken unless it has been pinned. A link
// may continue past the lifetime of the process if Close is
// not called.
Close() error
// Prevent external users from implementing this interface.
isLink()
}
// ID uniquely identifies a BPF link.
type ID uint32
// RawLinkOptions control the creation of a raw link.
type RawLinkOptions struct {
// File descriptor to attach to. This differs for each attach type.
Target int
// Program to attach.
Program *ebpf.Program
// Attach must match the attach type of Program.
Attach ebpf.AttachType
}
// RawLinkInfo contains metadata on a link.
type RawLinkInfo struct {
Type Type
ID ID
Program ebpf.ProgramID
}
// RawLink is the low-level API to bpf_link.
//
// You should consider using the higher level interfaces in this
// package instead.
type RawLink struct {
fd *internal.FD
pinnedPath string
}
// AttachRawLink creates a raw link.
func AttachRawLink(opts RawLinkOptions) (*RawLink, error) {
if err := haveBPFLink(); err != nil {
return nil, err
}
if opts.Target < 0 {
return nil, fmt.Errorf("invalid target: %s", internal.ErrClosedFd)
}
progFd := opts.Program.FD()
if progFd < 0 {
return nil, fmt.Errorf("invalid program: %s", internal.ErrClosedFd)
}
attr := bpfLinkCreateAttr{
targetFd: uint32(opts.Target),
progFd: uint32(progFd),
attachType: opts.Attach,
}
fd, err := bpfLinkCreate(&attr)
if err != nil {
return nil, fmt.Errorf("can't create link: %s", err)
}
return &RawLink{fd, ""}, nil
}
// LoadPinnedRawLink loads a persisted link from a bpffs.
//
// Returns an error if the pinned link type doesn't match linkType. Pass
// UnspecifiedType to disable this behaviour.
func LoadPinnedRawLink(fileName string, linkType Type, opts *ebpf.LoadPinOptions) (*RawLink, error) {
fd, err := internal.BPFObjGet(fileName, opts.Marshal())
if err != nil {
return nil, fmt.Errorf("load pinned link: %w", err)
}
link := &RawLink{fd, fileName}
if linkType == UnspecifiedType {
return link, nil
}
info, err := link.Info()
if err != nil {
link.Close()
return nil, fmt.Errorf("get pinned link info: %s", err)
}
if info.Type != linkType {
link.Close()
return nil, fmt.Errorf("link type %v doesn't match %v", info.Type, linkType)
}
return link, nil
}
func (l *RawLink) isLink() {}
// FD returns the raw file descriptor.
func (l *RawLink) FD() int {
fd, err := l.fd.Value()
if err != nil {
return -1
}
return int(fd)
}
// Close breaks the link.
//
// Use Pin if you want to make the link persistent.
func (l *RawLink) Close() error {
return l.fd.Close()
}
// Pin persists a link past the lifetime of the process.
//
// Calling Close on a pinned Link will not break the link
// until the pin is removed.
func (l *RawLink) Pin(fileName string) error {
if err := internal.Pin(l.pinnedPath, fileName, l.fd); err != nil {
return err
}
l.pinnedPath = fileName
return nil
}
// Unpin implements the Link interface.
func (l *RawLink) Unpin() error {
if err := internal.Unpin(l.pinnedPath); err != nil {
return err
}
l.pinnedPath = ""
return nil
}
// Update implements the Link interface.
func (l *RawLink) Update(new *ebpf.Program) error {
return l.UpdateArgs(RawLinkUpdateOptions{
New: new,
})
}
// RawLinkUpdateOptions control the behaviour of RawLink.UpdateArgs.
type RawLinkUpdateOptions struct {
New *ebpf.Program
Old *ebpf.Program
Flags uint32
}
// UpdateArgs updates a link based on args.
func (l *RawLink) UpdateArgs(opts RawLinkUpdateOptions) error {
newFd := opts.New.FD()
if newFd < 0 {
return fmt.Errorf("invalid program: %s", internal.ErrClosedFd)
}
var oldFd int
if opts.Old != nil {
oldFd = opts.Old.FD()
if oldFd < 0 {
return fmt.Errorf("invalid replacement program: %s", internal.ErrClosedFd)
}
}
linkFd, err := l.fd.Value()
if err != nil {
return fmt.Errorf("can't update link: %s", err)
}
attr := bpfLinkUpdateAttr{
linkFd: linkFd,
newProgFd: uint32(newFd),
oldProgFd: uint32(oldFd),
flags: opts.Flags,
}
return bpfLinkUpdate(&attr)
}
// struct bpf_link_info
type bpfLinkInfo struct {
typ uint32
id uint32
prog_id uint32
}
// Info returns metadata about the link.
func (l *RawLink) Info() (*RawLinkInfo, error) {
var info bpfLinkInfo
err := internal.BPFObjGetInfoByFD(l.fd, unsafe.Pointer(&info), unsafe.Sizeof(info))
if err != nil {
return nil, fmt.Errorf("link info: %s", err)
}
return &RawLinkInfo{
Type(info.typ),
ID(info.id),
ebpf.ProgramID(info.prog_id),
}, nil
}

View File

@ -0,0 +1,60 @@
package link
import (
"fmt"
"github.com/cilium/ebpf"
)
// NetNsInfo contains metadata about a network namespace link.
type NetNsInfo struct {
RawLinkInfo
}
// NetNsLink is a program attached to a network namespace.
type NetNsLink struct {
*RawLink
}
// AttachNetNs attaches a program to a network namespace.
func AttachNetNs(ns int, prog *ebpf.Program) (*NetNsLink, error) {
var attach ebpf.AttachType
switch t := prog.Type(); t {
case ebpf.FlowDissector:
attach = ebpf.AttachFlowDissector
case ebpf.SkLookup:
attach = ebpf.AttachSkLookup
default:
return nil, fmt.Errorf("can't attach %v to network namespace", t)
}
link, err := AttachRawLink(RawLinkOptions{
Target: ns,
Program: prog,
Attach: attach,
})
if err != nil {
return nil, err
}
return &NetNsLink{link}, nil
}
// LoadPinnedNetNs loads a network namespace link from bpffs.
func LoadPinnedNetNs(fileName string, opts *ebpf.LoadPinOptions) (*NetNsLink, error) {
link, err := LoadPinnedRawLink(fileName, NetNsType, opts)
if err != nil {
return nil, err
}
return &NetNsLink{link}, nil
}
// Info returns information about the link.
func (nns *NetNsLink) Info() (*NetNsInfo, error) {
info, err := nns.RawLink.Info()
if err != nil {
return nil, err
}
return &NetNsInfo{*info}, nil
}

View File

@ -0,0 +1,273 @@
package link
import (
"bytes"
"errors"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
"unsafe"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/internal"
"github.com/cilium/ebpf/internal/unix"
)
// Getting the terminology right is usually the hardest part. For posterity and
// for staying sane during implementation:
//
// - trace event: Representation of a kernel runtime hook. Filesystem entries
// under <tracefs>/events. Can be tracepoints (static), kprobes or uprobes.
// Can be instantiated into perf events (see below).
// - tracepoint: A predetermined hook point in the kernel. Exposed as trace
// events in (sub)directories under <tracefs>/events. Cannot be closed or
// removed, they are static.
// - k(ret)probe: Ephemeral trace events based on entry or exit points of
// exported kernel symbols. kprobe-based (tracefs) trace events can be
// created system-wide by writing to the <tracefs>/kprobe_events file, or
// they can be scoped to the current process by creating PMU perf events.
// - u(ret)probe: Ephemeral trace events based on user provides ELF binaries
// and offsets. uprobe-based (tracefs) trace events can be
// created system-wide by writing to the <tracefs>/uprobe_events file, or
// they can be scoped to the current process by creating PMU perf events.
// - perf event: An object instantiated based on an existing trace event or
// kernel symbol. Referred to by fd in userspace.
// Exactly one eBPF program can be attached to a perf event. Multiple perf
// events can be created from a single trace event. Closing a perf event
// stops any further invocations of the attached eBPF program.
var (
tracefsPath = "/sys/kernel/debug/tracing"
// Trace event groups, names and kernel symbols must adhere to this set
// of characters. Non-empty, first character must not be a number, all
// characters must be alphanumeric or underscore.
rgxTraceEvent = regexp.MustCompile("^[a-zA-Z_][0-9a-zA-Z_]*$")
errInvalidInput = errors.New("invalid input")
)
const (
perfAllThreads = -1
)
type perfEventType uint8
const (
tracepointEvent perfEventType = iota
kprobeEvent
kretprobeEvent
uprobeEvent
uretprobeEvent
)
// A perfEvent represents a perf event kernel object. Exactly one eBPF program
// can be attached to it. It is created based on a tracefs trace event or a
// Performance Monitoring Unit (PMU).
type perfEvent struct {
// Group and name of the tracepoint/kprobe/uprobe.
group string
name string
// PMU event ID read from sysfs. Valid IDs are non-zero.
pmuID uint64
// ID of the trace event read from tracefs. Valid IDs are non-zero.
tracefsID uint64
// The event type determines the types of programs that can be attached.
typ perfEventType
fd *internal.FD
}
func (pe *perfEvent) isLink() {}
func (pe *perfEvent) Pin(string) error {
return fmt.Errorf("pin perf event: %w", ErrNotSupported)
}
func (pe *perfEvent) Unpin() error {
return fmt.Errorf("unpin perf event: %w", ErrNotSupported)
}
// Since 4.15 (e87c6bc3852b "bpf: permit multiple bpf attachments for a single perf event"),
// calling PERF_EVENT_IOC_SET_BPF appends the given program to a prog_array
// owned by the perf event, which means multiple programs can be attached
// simultaneously.
//
// Before 4.15, calling PERF_EVENT_IOC_SET_BPF more than once on a perf event
// returns EEXIST.
//
// Detaching a program from a perf event is currently not possible, so a
// program replacement mechanism cannot be implemented for perf events.
func (pe *perfEvent) Update(prog *ebpf.Program) error {
return fmt.Errorf("can't replace eBPF program in perf event: %w", ErrNotSupported)
}
func (pe *perfEvent) Close() error {
if pe.fd == nil {
return nil
}
pfd, err := pe.fd.Value()
if err != nil {
return fmt.Errorf("getting perf event fd: %w", err)
}
err = unix.IoctlSetInt(int(pfd), unix.PERF_EVENT_IOC_DISABLE, 0)
if err != nil {
return fmt.Errorf("disabling perf event: %w", err)
}
err = pe.fd.Close()
if err != nil {
return fmt.Errorf("closing perf event fd: %w", err)
}
switch pe.typ {
case kprobeEvent, kretprobeEvent:
// Clean up kprobe tracefs entry.
if pe.tracefsID != 0 {
return closeTraceFSProbeEvent(kprobeType, pe.group, pe.name)
}
case uprobeEvent, uretprobeEvent:
// Clean up uprobe tracefs entry.
if pe.tracefsID != 0 {
return closeTraceFSProbeEvent(uprobeType, pe.group, pe.name)
}
case tracepointEvent:
// Tracepoint trace events don't hold any extra resources.
return nil
}
return nil
}
// attach the given eBPF prog to the perf event stored in pe.
// pe must contain a valid perf event fd.
// prog's type must match the program type stored in pe.
func (pe *perfEvent) attach(prog *ebpf.Program) error {
if prog == nil {
return errors.New("cannot attach a nil program")
}
if pe.fd == nil {
return errors.New("cannot attach to nil perf event")
}
if prog.FD() < 0 {
return fmt.Errorf("invalid program: %w", internal.ErrClosedFd)
}
switch pe.typ {
case kprobeEvent, kretprobeEvent, uprobeEvent, uretprobeEvent:
if t := prog.Type(); t != ebpf.Kprobe {
return fmt.Errorf("invalid program type (expected %s): %s", ebpf.Kprobe, t)
}
case tracepointEvent:
if t := prog.Type(); t != ebpf.TracePoint {
return fmt.Errorf("invalid program type (expected %s): %s", ebpf.TracePoint, t)
}
default:
return fmt.Errorf("unknown perf event type: %d", pe.typ)
}
// The ioctl below will fail when the fd is invalid.
kfd, _ := pe.fd.Value()
// Assign the eBPF program to the perf event.
err := unix.IoctlSetInt(int(kfd), unix.PERF_EVENT_IOC_SET_BPF, prog.FD())
if err != nil {
return fmt.Errorf("setting perf event bpf program: %w", err)
}
// PERF_EVENT_IOC_ENABLE and _DISABLE ignore their given values.
if err := unix.IoctlSetInt(int(kfd), unix.PERF_EVENT_IOC_ENABLE, 0); err != nil {
return fmt.Errorf("enable perf event: %s", err)
}
// Close the perf event when its reference is lost to avoid leaking system resources.
runtime.SetFinalizer(pe, (*perfEvent).Close)
return nil
}
// unsafeStringPtr returns an unsafe.Pointer to a NUL-terminated copy of str.
func unsafeStringPtr(str string) (unsafe.Pointer, error) {
p, err := unix.BytePtrFromString(str)
if err != nil {
return nil, err
}
return unsafe.Pointer(p), nil
}
// getTraceEventID reads a trace event's ID from tracefs given its group and name.
// group and name must be alphanumeric or underscore, as required by the kernel.
func getTraceEventID(group, name string) (uint64, error) {
tid, err := uint64FromFile(tracefsPath, "events", group, name, "id")
if errors.Is(err, os.ErrNotExist) {
return 0, fmt.Errorf("trace event %s/%s: %w", group, name, os.ErrNotExist)
}
if err != nil {
return 0, fmt.Errorf("reading trace event ID of %s/%s: %w", group, name, err)
}
return tid, nil
}
// getPMUEventType reads a Performance Monitoring Unit's type (numeric identifier)
// from /sys/bus/event_source/devices/<pmu>/type.
//
// Returns ErrNotSupported if the pmu type is not supported.
func getPMUEventType(typ probeType) (uint64, error) {
et, err := uint64FromFile("/sys/bus/event_source/devices", typ.String(), "type")
if errors.Is(err, os.ErrNotExist) {
return 0, fmt.Errorf("pmu type %s: %w", typ, ErrNotSupported)
}
if err != nil {
return 0, fmt.Errorf("reading pmu type %s: %w", typ, err)
}
return et, nil
}
// openTracepointPerfEvent opens a tracepoint-type perf event. System-wide
// [k,u]probes created by writing to <tracefs>/[k,u]probe_events are tracepoints
// behind the scenes, and can be attached to using these perf events.
func openTracepointPerfEvent(tid uint64) (*internal.FD, error) {
attr := unix.PerfEventAttr{
Type: unix.PERF_TYPE_TRACEPOINT,
Config: tid,
Sample_type: unix.PERF_SAMPLE_RAW,
Sample: 1,
Wakeup: 1,
}
fd, err := unix.PerfEventOpen(&attr, perfAllThreads, 0, -1, unix.PERF_FLAG_FD_CLOEXEC)
if err != nil {
return nil, fmt.Errorf("opening tracepoint perf event: %w", err)
}
return internal.NewFD(uint32(fd)), nil
}
// uint64FromFile reads a uint64 from a file. All elements of path are sanitized
// and joined onto base. Returns error if base no longer prefixes the path after
// joining all components.
func uint64FromFile(base string, path ...string) (uint64, error) {
l := filepath.Join(path...)
p := filepath.Join(base, l)
if !strings.HasPrefix(p, base) {
return 0, fmt.Errorf("path '%s' attempts to escape base path '%s': %w", l, base, errInvalidInput)
}
data, err := ioutil.ReadFile(p)
if err != nil {
return 0, fmt.Errorf("reading file %s: %w", p, err)
}
et := bytes.TrimSpace(data)
return strconv.ParseUint(string(et), 10, 64)
}

View File

@ -0,0 +1,25 @@
package link
import (
"fmt"
"runtime"
)
func platformPrefix(symbol string) string {
prefix := runtime.GOARCH
// per https://github.com/golang/go/blob/master/src/go/build/syslist.go
switch prefix {
case "386":
prefix = "ia32"
case "amd64", "amd64p32":
prefix = "x64"
case "arm64", "arm64be":
prefix = "arm64"
default:
return symbol
}
return fmt.Sprintf("__%s_%s", prefix, symbol)
}

View File

@ -0,0 +1,76 @@
package link
import (
"fmt"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/internal"
)
type RawAttachProgramOptions struct {
// File descriptor to attach to. This differs for each attach type.
Target int
// Program to attach.
Program *ebpf.Program
// Program to replace (cgroups).
Replace *ebpf.Program
// Attach must match the attach type of Program (and Replace).
Attach ebpf.AttachType
// Flags control the attach behaviour. This differs for each attach type.
Flags uint32
}
// RawAttachProgram is a low level wrapper around BPF_PROG_ATTACH.
//
// You should use one of the higher level abstractions available in this
// package if possible.
func RawAttachProgram(opts RawAttachProgramOptions) error {
if err := haveProgAttach(); err != nil {
return err
}
var replaceFd uint32
if opts.Replace != nil {
replaceFd = uint32(opts.Replace.FD())
}
attr := internal.BPFProgAttachAttr{
TargetFd: uint32(opts.Target),
AttachBpfFd: uint32(opts.Program.FD()),
ReplaceBpfFd: replaceFd,
AttachType: uint32(opts.Attach),
AttachFlags: uint32(opts.Flags),
}
if err := internal.BPFProgAttach(&attr); err != nil {
return fmt.Errorf("can't attach program: %w", err)
}
return nil
}
type RawDetachProgramOptions struct {
Target int
Program *ebpf.Program
Attach ebpf.AttachType
}
// RawDetachProgram is a low level wrapper around BPF_PROG_DETACH.
//
// You should use one of the higher level abstractions available in this
// package if possible.
func RawDetachProgram(opts RawDetachProgramOptions) error {
if err := haveProgAttach(); err != nil {
return err
}
attr := internal.BPFProgDetachAttr{
TargetFd: uint32(opts.Target),
AttachBpfFd: uint32(opts.Program.FD()),
AttachType: uint32(opts.Attach),
}
if err := internal.BPFProgDetach(&attr); err != nil {
return fmt.Errorf("can't detach program: %w", err)
}
return nil
}

View File

@ -0,0 +1,61 @@
package link
import (
"fmt"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/internal"
)
type RawTracepointOptions struct {
// Tracepoint name.
Name string
// Program must be of type RawTracepoint*
Program *ebpf.Program
}
// AttachRawTracepoint links a BPF program to a raw_tracepoint.
//
// Requires at least Linux 4.17.
func AttachRawTracepoint(opts RawTracepointOptions) (Link, error) {
if t := opts.Program.Type(); t != ebpf.RawTracepoint && t != ebpf.RawTracepointWritable {
return nil, fmt.Errorf("invalid program type %s, expected RawTracepoint(Writable)", t)
}
if opts.Program.FD() < 0 {
return nil, fmt.Errorf("invalid program: %w", internal.ErrClosedFd)
}
fd, err := bpfRawTracepointOpen(&bpfRawTracepointOpenAttr{
name: internal.NewStringPointer(opts.Name),
fd: uint32(opts.Program.FD()),
})
if err != nil {
return nil, err
}
return &progAttachRawTracepoint{fd: fd}, nil
}
type progAttachRawTracepoint struct {
fd *internal.FD
}
var _ Link = (*progAttachRawTracepoint)(nil)
func (rt *progAttachRawTracepoint) isLink() {}
func (rt *progAttachRawTracepoint) Close() error {
return rt.fd.Close()
}
func (rt *progAttachRawTracepoint) Update(_ *ebpf.Program) error {
return fmt.Errorf("can't update raw_tracepoint: %w", ErrNotSupported)
}
func (rt *progAttachRawTracepoint) Pin(_ string) error {
return fmt.Errorf("can't pin raw_tracepoint: %w", ErrNotSupported)
}
func (rt *progAttachRawTracepoint) Unpin() error {
return fmt.Errorf("unpin raw_tracepoint: %w", ErrNotSupported)
}

View File

@ -0,0 +1,190 @@
package link
import (
"errors"
"unsafe"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/internal"
"github.com/cilium/ebpf/internal/unix"
)
// Type is the kind of link.
type Type uint32
// Valid link types.
//
// Equivalent to enum bpf_link_type.
const (
UnspecifiedType Type = iota
RawTracepointType
TracingType
CgroupType
IterType
NetNsType
XDPType
)
var haveProgAttach = internal.FeatureTest("BPF_PROG_ATTACH", "4.10", func() error {
prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
Type: ebpf.CGroupSKB,
AttachType: ebpf.AttachCGroupInetIngress,
License: "MIT",
Instructions: asm.Instructions{
asm.Mov.Imm(asm.R0, 0),
asm.Return(),
},
})
if err != nil {
return internal.ErrNotSupported
}
// BPF_PROG_ATTACH was introduced at the same time as CGgroupSKB,
// so being able to load the program is enough to infer that we
// have the syscall.
prog.Close()
return nil
})
var haveProgAttachReplace = internal.FeatureTest("BPF_PROG_ATTACH atomic replacement", "5.5", func() error {
if err := haveProgAttach(); err != nil {
return err
}
prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
Type: ebpf.CGroupSKB,
AttachType: ebpf.AttachCGroupInetIngress,
License: "MIT",
Instructions: asm.Instructions{
asm.Mov.Imm(asm.R0, 0),
asm.Return(),
},
})
if err != nil {
return internal.ErrNotSupported
}
defer prog.Close()
// We know that we have BPF_PROG_ATTACH since we can load CGroupSKB programs.
// If passing BPF_F_REPLACE gives us EINVAL we know that the feature isn't
// present.
attr := internal.BPFProgAttachAttr{
// We rely on this being checked after attachFlags.
TargetFd: ^uint32(0),
AttachBpfFd: uint32(prog.FD()),
AttachType: uint32(ebpf.AttachCGroupInetIngress),
AttachFlags: uint32(flagReplace),
}
err = internal.BPFProgAttach(&attr)
if errors.Is(err, unix.EINVAL) {
return internal.ErrNotSupported
}
if errors.Is(err, unix.EBADF) {
return nil
}
return err
})
type bpfLinkCreateAttr struct {
progFd uint32
targetFd uint32
attachType ebpf.AttachType
flags uint32
}
func bpfLinkCreate(attr *bpfLinkCreateAttr) (*internal.FD, error) {
ptr, err := internal.BPF(internal.BPF_LINK_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
if err != nil {
return nil, err
}
return internal.NewFD(uint32(ptr)), nil
}
type bpfLinkCreateIterAttr struct {
prog_fd uint32
target_fd uint32
attach_type ebpf.AttachType
flags uint32
iter_info internal.Pointer
iter_info_len uint32
}
func bpfLinkCreateIter(attr *bpfLinkCreateIterAttr) (*internal.FD, error) {
ptr, err := internal.BPF(internal.BPF_LINK_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
if err != nil {
return nil, err
}
return internal.NewFD(uint32(ptr)), nil
}
type bpfLinkUpdateAttr struct {
linkFd uint32
newProgFd uint32
flags uint32
oldProgFd uint32
}
func bpfLinkUpdate(attr *bpfLinkUpdateAttr) error {
_, err := internal.BPF(internal.BPF_LINK_UPDATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
return err
}
var haveBPFLink = internal.FeatureTest("bpf_link", "5.7", func() error {
prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
Type: ebpf.CGroupSKB,
AttachType: ebpf.AttachCGroupInetIngress,
License: "MIT",
Instructions: asm.Instructions{
asm.Mov.Imm(asm.R0, 0),
asm.Return(),
},
})
if err != nil {
return internal.ErrNotSupported
}
defer prog.Close()
attr := bpfLinkCreateAttr{
// This is a hopefully invalid file descriptor, which triggers EBADF.
targetFd: ^uint32(0),
progFd: uint32(prog.FD()),
attachType: ebpf.AttachCGroupInetIngress,
}
_, err = bpfLinkCreate(&attr)
if errors.Is(err, unix.EINVAL) {
return internal.ErrNotSupported
}
if errors.Is(err, unix.EBADF) {
return nil
}
return err
})
type bpfIterCreateAttr struct {
linkFd uint32
flags uint32
}
func bpfIterCreate(attr *bpfIterCreateAttr) (*internal.FD, error) {
ptr, err := internal.BPF(internal.BPF_ITER_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
if err == nil {
return internal.NewFD(uint32(ptr)), nil
}
return nil, err
}
type bpfRawTracepointOpenAttr struct {
name internal.Pointer
fd uint32
_ uint32
}
func bpfRawTracepointOpen(attr *bpfRawTracepointOpenAttr) (*internal.FD, error) {
ptr, err := internal.BPF(internal.BPF_RAW_TRACEPOINT_OPEN, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
if err == nil {
return internal.NewFD(uint32(ptr)), nil
}
return nil, err
}

View File

@ -0,0 +1,56 @@
package link
import (
"fmt"
"github.com/cilium/ebpf"
)
// Tracepoint attaches the given eBPF program to the tracepoint with the given
// group and name. See /sys/kernel/debug/tracing/events to find available
// tracepoints. The top-level directory is the group, the event's subdirectory
// is the name. Example:
//
// Tracepoint("syscalls", "sys_enter_fork", prog)
//
// Note that attaching eBPF programs to syscalls (sys_enter_*/sys_exit_*) is
// only possible as of kernel 4.14 (commit cf5f5ce).
func Tracepoint(group, name string, prog *ebpf.Program) (Link, error) {
if group == "" || name == "" {
return nil, fmt.Errorf("group and name cannot be empty: %w", errInvalidInput)
}
if prog == nil {
return nil, fmt.Errorf("prog cannot be nil: %w", errInvalidInput)
}
if !rgxTraceEvent.MatchString(group) || !rgxTraceEvent.MatchString(name) {
return nil, fmt.Errorf("group and name '%s/%s' must be alphanumeric or underscore: %w", group, name, errInvalidInput)
}
if prog.Type() != ebpf.TracePoint {
return nil, fmt.Errorf("eBPF program type %s is not a Tracepoint: %w", prog.Type(), errInvalidInput)
}
tid, err := getTraceEventID(group, name)
if err != nil {
return nil, err
}
fd, err := openTracepointPerfEvent(tid)
if err != nil {
return nil, err
}
pe := &perfEvent{
fd: fd,
tracefsID: tid,
group: group,
name: name,
typ: tracepointEvent,
}
if err := pe.attach(prog); err != nil {
pe.Close()
return nil, err
}
return pe, nil
}

View File

@ -0,0 +1,237 @@
package link
import (
"debug/elf"
"errors"
"fmt"
"os"
"path/filepath"
"regexp"
"sync"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/internal"
)
var (
uprobeEventsPath = filepath.Join(tracefsPath, "uprobe_events")
// rgxUprobeSymbol is used to strip invalid characters from the uprobe symbol
// as they are not allowed to be used as the EVENT token in tracefs.
rgxUprobeSymbol = regexp.MustCompile("[^a-zA-Z0-9]+")
uprobeRetprobeBit = struct {
once sync.Once
value uint64
err error
}{}
)
// Executable defines an executable program on the filesystem.
type Executable struct {
// Path of the executable on the filesystem.
path string
// Parsed ELF symbols and dynamic symbols.
symbols map[string]elf.Symbol
}
// UprobeOptions defines additional parameters that will be used
// when loading Uprobes.
type UprobeOptions struct {
// Symbol offset. Must be provided in case of external symbols (shared libs).
// If set, overrides the offset eventually parsed from the executable.
Offset uint64
}
// To open a new Executable, use:
//
// OpenExecutable("/bin/bash")
//
// The returned value can then be used to open Uprobe(s).
func OpenExecutable(path string) (*Executable, error) {
if path == "" {
return nil, fmt.Errorf("path cannot be empty")
}
f, err := os.Open(path)
if err != nil {
return nil, fmt.Errorf("open file '%s': %w", path, err)
}
defer f.Close()
se, err := internal.NewSafeELFFile(f)
if err != nil {
return nil, fmt.Errorf("parse ELF file: %w", err)
}
var ex = Executable{
path: path,
symbols: make(map[string]elf.Symbol),
}
if err := ex.addSymbols(se.Symbols); err != nil {
return nil, err
}
if err := ex.addSymbols(se.DynamicSymbols); err != nil {
return nil, err
}
return &ex, nil
}
func (ex *Executable) addSymbols(f func() ([]elf.Symbol, error)) error {
// elf.Symbols and elf.DynamicSymbols return ErrNoSymbols if the section is not found.
syms, err := f()
if err != nil && !errors.Is(err, elf.ErrNoSymbols) {
return err
}
for _, s := range syms {
if elf.ST_TYPE(s.Info) != elf.STT_FUNC {
// Symbol not associated with a function or other executable code.
continue
}
ex.symbols[s.Name] = s
}
return nil
}
func (ex *Executable) symbol(symbol string) (*elf.Symbol, error) {
if s, ok := ex.symbols[symbol]; ok {
return &s, nil
}
return nil, fmt.Errorf("symbol %s not found", symbol)
}
// Uprobe attaches the given eBPF program to a perf event that fires when the
// given symbol starts executing in the given Executable.
// For example, /bin/bash::main():
//
// ex, _ = OpenExecutable("/bin/bash")
// ex.Uprobe("main", prog, nil)
//
// When using symbols which belongs to shared libraries,
// an offset must be provided via options:
//
// ex.Uprobe("main", prog, &UprobeOptions{Offset: 0x123})
//
// The resulting Link must be Closed during program shutdown to avoid leaking
// system resources. Functions provided by shared libraries can currently not
// be traced and will result in an ErrNotSupported.
func (ex *Executable) Uprobe(symbol string, prog *ebpf.Program, opts *UprobeOptions) (Link, error) {
u, err := ex.uprobe(symbol, prog, opts, false)
if err != nil {
return nil, err
}
err = u.attach(prog)
if err != nil {
u.Close()
return nil, err
}
return u, nil
}
// Uretprobe attaches the given eBPF program to a perf event that fires right
// before the given symbol exits. For example, /bin/bash::main():
//
// ex, _ = OpenExecutable("/bin/bash")
// ex.Uretprobe("main", prog, nil)
//
// When using symbols which belongs to shared libraries,
// an offset must be provided via options:
//
// ex.Uretprobe("main", prog, &UprobeOptions{Offset: 0x123})
//
// The resulting Link must be Closed during program shutdown to avoid leaking
// system resources. Functions provided by shared libraries can currently not
// be traced and will result in an ErrNotSupported.
func (ex *Executable) Uretprobe(symbol string, prog *ebpf.Program, opts *UprobeOptions) (Link, error) {
u, err := ex.uprobe(symbol, prog, opts, true)
if err != nil {
return nil, err
}
err = u.attach(prog)
if err != nil {
u.Close()
return nil, err
}
return u, nil
}
// uprobe opens a perf event for the given binary/symbol and attaches prog to it.
// If ret is true, create a uretprobe.
func (ex *Executable) uprobe(symbol string, prog *ebpf.Program, opts *UprobeOptions, ret bool) (*perfEvent, error) {
if prog == nil {
return nil, fmt.Errorf("prog cannot be nil: %w", errInvalidInput)
}
if prog.Type() != ebpf.Kprobe {
return nil, fmt.Errorf("eBPF program type %s is not Kprobe: %w", prog.Type(), errInvalidInput)
}
var offset uint64
if opts != nil && opts.Offset != 0 {
offset = opts.Offset
} else {
sym, err := ex.symbol(symbol)
if err != nil {
return nil, fmt.Errorf("symbol '%s' not found: %w", symbol, err)
}
// Symbols with location 0 from section undef are shared library calls and
// are relocated before the binary is executed. Dynamic linking is not
// implemented by the library, so mark this as unsupported for now.
if sym.Section == elf.SHN_UNDEF && sym.Value == 0 {
return nil, fmt.Errorf("cannot resolve %s library call '%s', "+
"consider providing the offset via options: %w", ex.path, symbol, ErrNotSupported)
}
offset = sym.Value
}
// Use uprobe PMU if the kernel has it available.
tp, err := pmuUprobe(symbol, ex.path, offset, ret)
if err == nil {
return tp, nil
}
if err != nil && !errors.Is(err, ErrNotSupported) {
return nil, fmt.Errorf("creating perf_uprobe PMU: %w", err)
}
// Use tracefs if uprobe PMU is missing.
tp, err = tracefsUprobe(uprobeSanitizedSymbol(symbol), ex.path, offset, ret)
if err != nil {
return nil, fmt.Errorf("creating trace event '%s:%s' in tracefs: %w", ex.path, symbol, err)
}
return tp, nil
}
// pmuUprobe opens a perf event based on the uprobe PMU.
func pmuUprobe(symbol, path string, offset uint64, ret bool) (*perfEvent, error) {
return pmuProbe(uprobeType, symbol, path, offset, ret)
}
// tracefsUprobe creates a Uprobe tracefs entry.
func tracefsUprobe(symbol, path string, offset uint64, ret bool) (*perfEvent, error) {
return tracefsProbe(uprobeType, symbol, path, offset, ret)
}
// uprobeSanitizedSymbol replaces every invalid characted for the tracefs api with an underscore.
func uprobeSanitizedSymbol(symbol string) string {
return rgxUprobeSymbol.ReplaceAllString(symbol, "_")
}
// uprobePathOffset creates the PATH:OFFSET token for the tracefs api.
func uprobePathOffset(path string, offset uint64) string {
return fmt.Sprintf("%s:%#x", path, offset)
}
func uretprobeBit() (uint64, error) {
uprobeRetprobeBit.once.Do(func() {
uprobeRetprobeBit.value, uprobeRetprobeBit.err = determineRetprobeBit(uprobeType)
})
return uprobeRetprobeBit.value, uprobeRetprobeBit.err
}

140
src/runtime/vendor/github.com/cilium/ebpf/linker.go generated vendored Normal file
View File

@ -0,0 +1,140 @@
package ebpf
import (
"fmt"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/internal/btf"
)
// link resolves bpf-to-bpf calls.
//
// Each library may contain multiple functions / labels, and is only linked
// if prog references one of these functions.
//
// Libraries also linked.
func link(prog *ProgramSpec, libs []*ProgramSpec) error {
var (
linked = make(map[*ProgramSpec]bool)
pending = []asm.Instructions{prog.Instructions}
insns asm.Instructions
)
for len(pending) > 0 {
insns, pending = pending[0], pending[1:]
for _, lib := range libs {
if linked[lib] {
continue
}
needed, err := needSection(insns, lib.Instructions)
if err != nil {
return fmt.Errorf("linking %s: %w", lib.Name, err)
}
if !needed {
continue
}
linked[lib] = true
prog.Instructions = append(prog.Instructions, lib.Instructions...)
pending = append(pending, lib.Instructions)
if prog.BTF != nil && lib.BTF != nil {
if err := btf.ProgramAppend(prog.BTF, lib.BTF); err != nil {
return fmt.Errorf("linking BTF of %s: %w", lib.Name, err)
}
}
}
}
return nil
}
func needSection(insns, section asm.Instructions) (bool, error) {
// A map of symbols to the libraries which contain them.
symbols, err := section.SymbolOffsets()
if err != nil {
return false, err
}
for _, ins := range insns {
if ins.Reference == "" {
continue
}
if ins.OpCode.JumpOp() != asm.Call || ins.Src != asm.PseudoCall {
continue
}
if ins.Constant != -1 {
// This is already a valid call, no need to link again.
continue
}
if _, ok := symbols[ins.Reference]; !ok {
// Symbol isn't available in this section
continue
}
// At this point we know that at least one function in the
// library is called from insns, so we have to link it.
return true, nil
}
// None of the functions in the section are called.
return false, nil
}
func fixupJumpsAndCalls(insns asm.Instructions) error {
symbolOffsets := make(map[string]asm.RawInstructionOffset)
iter := insns.Iterate()
for iter.Next() {
ins := iter.Ins
if ins.Symbol == "" {
continue
}
if _, ok := symbolOffsets[ins.Symbol]; ok {
return fmt.Errorf("duplicate symbol %s", ins.Symbol)
}
symbolOffsets[ins.Symbol] = iter.Offset
}
iter = insns.Iterate()
for iter.Next() {
i := iter.Index
offset := iter.Offset
ins := iter.Ins
if ins.Reference == "" {
continue
}
switch {
case ins.IsFunctionCall() && ins.Constant == -1:
// Rewrite bpf to bpf call
callOffset, ok := symbolOffsets[ins.Reference]
if !ok {
return fmt.Errorf("call at %d: reference to missing symbol %q", i, ins.Reference)
}
ins.Constant = int64(callOffset - offset - 1)
case ins.OpCode.Class() == asm.JumpClass && ins.Offset == -1:
// Rewrite jump to label
jumpOffset, ok := symbolOffsets[ins.Reference]
if !ok {
return fmt.Errorf("jump at %d: reference to missing symbol %q", i, ins.Reference)
}
ins.Offset = int16(jumpOffset - offset - 1)
case ins.IsLoadFromMap() && ins.MapPtr() == -1:
return fmt.Errorf("map %s: %w", ins.Reference, errUnsatisfiedReference)
}
}
return nil
}

1232
src/runtime/vendor/github.com/cilium/ebpf/map.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

218
src/runtime/vendor/github.com/cilium/ebpf/marshalers.go generated vendored Normal file
View File

@ -0,0 +1,218 @@
package ebpf
import (
"bytes"
"encoding"
"encoding/binary"
"errors"
"fmt"
"reflect"
"runtime"
"unsafe"
"github.com/cilium/ebpf/internal"
)
// marshalPtr converts an arbitrary value into a pointer suitable
// to be passed to the kernel.
//
// As an optimization, it returns the original value if it is an
// unsafe.Pointer.
func marshalPtr(data interface{}, length int) (internal.Pointer, error) {
if ptr, ok := data.(unsafe.Pointer); ok {
return internal.NewPointer(ptr), nil
}
buf, err := marshalBytes(data, length)
if err != nil {
return internal.Pointer{}, err
}
return internal.NewSlicePointer(buf), nil
}
// marshalBytes converts an arbitrary value into a byte buffer.
//
// Prefer using Map.marshalKey and Map.marshalValue if possible, since
// those have special cases that allow more types to be encoded.
//
// Returns an error if the given value isn't representable in exactly
// length bytes.
func marshalBytes(data interface{}, length int) (buf []byte, err error) {
switch value := data.(type) {
case encoding.BinaryMarshaler:
buf, err = value.MarshalBinary()
case string:
buf = []byte(value)
case []byte:
buf = value
case unsafe.Pointer:
err = errors.New("can't marshal from unsafe.Pointer")
case Map, *Map, Program, *Program:
err = fmt.Errorf("can't marshal %T", value)
default:
var wr bytes.Buffer
err = binary.Write(&wr, internal.NativeEndian, value)
if err != nil {
err = fmt.Errorf("encoding %T: %v", value, err)
}
buf = wr.Bytes()
}
if err != nil {
return nil, err
}
if len(buf) != length {
return nil, fmt.Errorf("%T doesn't marshal to %d bytes", data, length)
}
return buf, nil
}
func makeBuffer(dst interface{}, length int) (internal.Pointer, []byte) {
if ptr, ok := dst.(unsafe.Pointer); ok {
return internal.NewPointer(ptr), nil
}
buf := make([]byte, length)
return internal.NewSlicePointer(buf), buf
}
// unmarshalBytes converts a byte buffer into an arbitrary value.
//
// Prefer using Map.unmarshalKey and Map.unmarshalValue if possible, since
// those have special cases that allow more types to be encoded.
func unmarshalBytes(data interface{}, buf []byte) error {
switch value := data.(type) {
case unsafe.Pointer:
// This could be solved in Go 1.17 by unsafe.Slice instead. (https://github.com/golang/go/issues/19367)
// We could opt for removing unsafe.Pointer support in the lib as well.
sh := &reflect.SliceHeader{ //nolint:govet
Data: uintptr(value),
Len: len(buf),
Cap: len(buf),
}
dst := *(*[]byte)(unsafe.Pointer(sh))
copy(dst, buf)
runtime.KeepAlive(value)
return nil
case Map, *Map, Program, *Program:
return fmt.Errorf("can't unmarshal into %T", value)
case encoding.BinaryUnmarshaler:
return value.UnmarshalBinary(buf)
case *string:
*value = string(buf)
return nil
case *[]byte:
*value = buf
return nil
case string:
return errors.New("require pointer to string")
case []byte:
return errors.New("require pointer to []byte")
default:
rd := bytes.NewReader(buf)
if err := binary.Read(rd, internal.NativeEndian, value); err != nil {
return fmt.Errorf("decoding %T: %v", value, err)
}
return nil
}
}
// marshalPerCPUValue encodes a slice containing one value per
// possible CPU into a buffer of bytes.
//
// Values are initialized to zero if the slice has less elements than CPUs.
//
// slice must have a type like []elementType.
func marshalPerCPUValue(slice interface{}, elemLength int) (internal.Pointer, error) {
sliceType := reflect.TypeOf(slice)
if sliceType.Kind() != reflect.Slice {
return internal.Pointer{}, errors.New("per-CPU value requires slice")
}
possibleCPUs, err := internal.PossibleCPUs()
if err != nil {
return internal.Pointer{}, err
}
sliceValue := reflect.ValueOf(slice)
sliceLen := sliceValue.Len()
if sliceLen > possibleCPUs {
return internal.Pointer{}, fmt.Errorf("per-CPU value exceeds number of CPUs")
}
alignedElemLength := align(elemLength, 8)
buf := make([]byte, alignedElemLength*possibleCPUs)
for i := 0; i < sliceLen; i++ {
elem := sliceValue.Index(i).Interface()
elemBytes, err := marshalBytes(elem, elemLength)
if err != nil {
return internal.Pointer{}, err
}
offset := i * alignedElemLength
copy(buf[offset:offset+elemLength], elemBytes)
}
return internal.NewSlicePointer(buf), nil
}
// unmarshalPerCPUValue decodes a buffer into a slice containing one value per
// possible CPU.
//
// valueOut must have a type like *[]elementType
func unmarshalPerCPUValue(slicePtr interface{}, elemLength int, buf []byte) error {
slicePtrType := reflect.TypeOf(slicePtr)
if slicePtrType.Kind() != reflect.Ptr || slicePtrType.Elem().Kind() != reflect.Slice {
return fmt.Errorf("per-cpu value requires pointer to slice")
}
possibleCPUs, err := internal.PossibleCPUs()
if err != nil {
return err
}
sliceType := slicePtrType.Elem()
slice := reflect.MakeSlice(sliceType, possibleCPUs, possibleCPUs)
sliceElemType := sliceType.Elem()
sliceElemIsPointer := sliceElemType.Kind() == reflect.Ptr
if sliceElemIsPointer {
sliceElemType = sliceElemType.Elem()
}
step := len(buf) / possibleCPUs
if step < elemLength {
return fmt.Errorf("per-cpu element length is larger than available data")
}
for i := 0; i < possibleCPUs; i++ {
var elem interface{}
if sliceElemIsPointer {
newElem := reflect.New(sliceElemType)
slice.Index(i).Set(newElem)
elem = newElem.Interface()
} else {
elem = slice.Index(i).Addr().Interface()
}
// Make a copy, since unmarshal can hold on to itemBytes
elemBytes := make([]byte, elemLength)
copy(elemBytes, buf[:elemLength])
err := unmarshalBytes(elem, elemBytes)
if err != nil {
return fmt.Errorf("cpu %d: %w", i, err)
}
buf = buf[step:]
}
reflect.ValueOf(slicePtr).Elem().Set(slice)
return nil
}
func align(n, alignment int) int {
return (int(n) + alignment - 1) / alignment * alignment
}

728
src/runtime/vendor/github.com/cilium/ebpf/prog.go generated vendored Normal file
View File

@ -0,0 +1,728 @@
package ebpf
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"math"
"path/filepath"
"strings"
"time"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/internal"
"github.com/cilium/ebpf/internal/btf"
"github.com/cilium/ebpf/internal/unix"
)
// ErrNotSupported is returned whenever the kernel doesn't support a feature.
var ErrNotSupported = internal.ErrNotSupported
var errUnsatisfiedReference = errors.New("unsatisfied reference")
// ProgramID represents the unique ID of an eBPF program.
type ProgramID uint32
const (
// Number of bytes to pad the output buffer for BPF_PROG_TEST_RUN.
// This is currently the maximum of spare space allocated for SKB
// and XDP programs, and equal to XDP_PACKET_HEADROOM + NET_IP_ALIGN.
outputPad = 256 + 2
)
// DefaultVerifierLogSize is the default number of bytes allocated for the
// verifier log.
const DefaultVerifierLogSize = 64 * 1024
// ProgramOptions control loading a program into the kernel.
type ProgramOptions struct {
// Controls the detail emitted by the kernel verifier. Set to non-zero
// to enable logging.
LogLevel uint32
// Controls the output buffer size for the verifier. Defaults to
// DefaultVerifierLogSize.
LogSize int
// An ELF containing the target BTF for this program. It is used both to
// find the correct function to trace and to apply CO-RE relocations.
// This is useful in environments where the kernel BTF is not available
// (containers) or where it is in a non-standard location. Defaults to
// use the kernel BTF from a well-known location.
TargetBTF io.ReaderAt
}
// ProgramSpec defines a Program.
type ProgramSpec struct {
// Name is passed to the kernel as a debug aid. Must only contain
// alpha numeric and '_' characters.
Name string
// Type determines at which hook in the kernel a program will run.
Type ProgramType
AttachType AttachType
// Name of a kernel data structure to attach to. It's interpretation
// depends on Type and AttachType.
AttachTo string
Instructions asm.Instructions
// Flags is passed to the kernel and specifies additional program
// load attributes.
Flags uint32
// License of the program. Some helpers are only available if
// the license is deemed compatible with the GPL.
//
// See https://www.kernel.org/doc/html/latest/process/license-rules.html#id1
License string
// Version used by Kprobe programs.
//
// Deprecated on kernels 5.0 and later. Leave empty to let the library
// detect this value automatically.
KernelVersion uint32
// The BTF associated with this program. Changing Instructions
// will most likely invalidate the contained data, and may
// result in errors when attempting to load it into the kernel.
BTF *btf.Program
// The byte order this program was compiled for, may be nil.
ByteOrder binary.ByteOrder
}
// Copy returns a copy of the spec.
func (ps *ProgramSpec) Copy() *ProgramSpec {
if ps == nil {
return nil
}
cpy := *ps
cpy.Instructions = make(asm.Instructions, len(ps.Instructions))
copy(cpy.Instructions, ps.Instructions)
return &cpy
}
// Tag calculates the kernel tag for a series of instructions.
//
// Use asm.Instructions.Tag if you need to calculate for non-native endianness.
func (ps *ProgramSpec) Tag() (string, error) {
return ps.Instructions.Tag(internal.NativeEndian)
}
// Program represents BPF program loaded into the kernel.
//
// It is not safe to close a Program which is used by other goroutines.
type Program struct {
// Contains the output of the kernel verifier if enabled,
// otherwise it is empty.
VerifierLog string
fd *internal.FD
name string
pinnedPath string
typ ProgramType
}
// NewProgram creates a new Program.
//
// Loading a program for the first time will perform
// feature detection by loading small, temporary programs.
func NewProgram(spec *ProgramSpec) (*Program, error) {
return NewProgramWithOptions(spec, ProgramOptions{})
}
// NewProgramWithOptions creates a new Program.
//
// Loading a program for the first time will perform
// feature detection by loading small, temporary programs.
func NewProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, error) {
handles := newHandleCache()
defer handles.close()
prog, err := newProgramWithOptions(spec, opts, handles)
if errors.Is(err, errUnsatisfiedReference) {
return nil, fmt.Errorf("cannot load program without loading its whole collection: %w", err)
}
return prog, err
}
func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions, handles *handleCache) (*Program, error) {
if len(spec.Instructions) == 0 {
return nil, errors.New("Instructions cannot be empty")
}
if spec.ByteOrder != nil && spec.ByteOrder != internal.NativeEndian {
return nil, fmt.Errorf("can't load %s program on %s", spec.ByteOrder, internal.NativeEndian)
}
// Kernels before 5.0 (6c4fc209fcf9 "bpf: remove useless version check for prog load")
// require the version field to be set to the value of the KERNEL_VERSION
// macro for kprobe-type programs.
// Overwrite Kprobe program version if set to zero or the magic version constant.
kv := spec.KernelVersion
if spec.Type == Kprobe && (kv == 0 || kv == internal.MagicKernelVersion) {
v, err := internal.KernelVersion()
if err != nil {
return nil, fmt.Errorf("detecting kernel version: %w", err)
}
kv = v.Kernel()
}
attr := &bpfProgLoadAttr{
progType: spec.Type,
progFlags: spec.Flags,
expectedAttachType: spec.AttachType,
license: internal.NewStringPointer(spec.License),
kernelVersion: kv,
}
if haveObjName() == nil {
attr.progName = internal.NewBPFObjName(spec.Name)
}
var err error
var targetBTF *btf.Spec
if opts.TargetBTF != nil {
targetBTF, err = handles.btfSpec(opts.TargetBTF)
if err != nil {
return nil, fmt.Errorf("load target BTF: %w", err)
}
}
var btfDisabled bool
var core btf.COREFixups
if spec.BTF != nil {
core, err = btf.ProgramFixups(spec.BTF, targetBTF)
if err != nil {
return nil, fmt.Errorf("CO-RE relocations: %w", err)
}
handle, err := handles.btfHandle(btf.ProgramSpec(spec.BTF))
btfDisabled = errors.Is(err, btf.ErrNotSupported)
if err != nil && !btfDisabled {
return nil, fmt.Errorf("load BTF: %w", err)
}
if handle != nil {
attr.progBTFFd = uint32(handle.FD())
recSize, bytes, err := btf.ProgramLineInfos(spec.BTF)
if err != nil {
return nil, fmt.Errorf("get BTF line infos: %w", err)
}
attr.lineInfoRecSize = recSize
attr.lineInfoCnt = uint32(uint64(len(bytes)) / uint64(recSize))
attr.lineInfo = internal.NewSlicePointer(bytes)
recSize, bytes, err = btf.ProgramFuncInfos(spec.BTF)
if err != nil {
return nil, fmt.Errorf("get BTF function infos: %w", err)
}
attr.funcInfoRecSize = recSize
attr.funcInfoCnt = uint32(uint64(len(bytes)) / uint64(recSize))
attr.funcInfo = internal.NewSlicePointer(bytes)
}
}
insns, err := core.Apply(spec.Instructions)
if err != nil {
return nil, fmt.Errorf("CO-RE fixup: %w", err)
}
if err := fixupJumpsAndCalls(insns); err != nil {
return nil, err
}
buf := bytes.NewBuffer(make([]byte, 0, len(spec.Instructions)*asm.InstructionSize))
err = insns.Marshal(buf, internal.NativeEndian)
if err != nil {
return nil, err
}
bytecode := buf.Bytes()
attr.instructions = internal.NewSlicePointer(bytecode)
attr.insCount = uint32(len(bytecode) / asm.InstructionSize)
if spec.AttachTo != "" {
target, err := resolveBTFType(targetBTF, spec.AttachTo, spec.Type, spec.AttachType)
if err != nil {
return nil, err
}
if target != nil {
attr.attachBTFID = target.ID()
}
}
logSize := DefaultVerifierLogSize
if opts.LogSize > 0 {
logSize = opts.LogSize
}
var logBuf []byte
if opts.LogLevel > 0 {
logBuf = make([]byte, logSize)
attr.logLevel = opts.LogLevel
attr.logSize = uint32(len(logBuf))
attr.logBuf = internal.NewSlicePointer(logBuf)
}
fd, err := bpfProgLoad(attr)
if err == nil {
return &Program{internal.CString(logBuf), fd, spec.Name, "", spec.Type}, nil
}
logErr := err
if opts.LogLevel == 0 && opts.LogSize >= 0 {
// Re-run with the verifier enabled to get better error messages.
logBuf = make([]byte, logSize)
attr.logLevel = 1
attr.logSize = uint32(len(logBuf))
attr.logBuf = internal.NewSlicePointer(logBuf)
_, logErr = bpfProgLoad(attr)
}
if errors.Is(logErr, unix.EPERM) && logBuf[0] == 0 {
// EPERM due to RLIMIT_MEMLOCK happens before the verifier, so we can
// check that the log is empty to reduce false positives.
return nil, fmt.Errorf("load program: RLIMIT_MEMLOCK may be too low: %w", logErr)
}
err = internal.ErrorWithLog(err, logBuf, logErr)
if btfDisabled {
return nil, fmt.Errorf("load program without BTF: %w", err)
}
return nil, fmt.Errorf("load program: %w", err)
}
// NewProgramFromFD creates a program from a raw fd.
//
// You should not use fd after calling this function.
//
// Requires at least Linux 4.10.
func NewProgramFromFD(fd int) (*Program, error) {
if fd < 0 {
return nil, errors.New("invalid fd")
}
return newProgramFromFD(internal.NewFD(uint32(fd)))
}
// NewProgramFromID returns the program for a given id.
//
// Returns ErrNotExist, if there is no eBPF program with the given id.
func NewProgramFromID(id ProgramID) (*Program, error) {
fd, err := bpfObjGetFDByID(internal.BPF_PROG_GET_FD_BY_ID, uint32(id))
if err != nil {
return nil, fmt.Errorf("get program by id: %w", err)
}
return newProgramFromFD(fd)
}
func newProgramFromFD(fd *internal.FD) (*Program, error) {
info, err := newProgramInfoFromFd(fd)
if err != nil {
fd.Close()
return nil, fmt.Errorf("discover program type: %w", err)
}
return &Program{"", fd, "", "", info.Type}, nil
}
func (p *Program) String() string {
if p.name != "" {
return fmt.Sprintf("%s(%s)#%v", p.typ, p.name, p.fd)
}
return fmt.Sprintf("%s(%v)", p.typ, p.fd)
}
// Type returns the underlying type of the program.
func (p *Program) Type() ProgramType {
return p.typ
}
// Info returns metadata about the program.
//
// Requires at least 4.10.
func (p *Program) Info() (*ProgramInfo, error) {
return newProgramInfoFromFd(p.fd)
}
// FD gets the file descriptor of the Program.
//
// It is invalid to call this function after Close has been called.
func (p *Program) FD() int {
fd, err := p.fd.Value()
if err != nil {
// Best effort: -1 is the number most likely to be an
// invalid file descriptor.
return -1
}
return int(fd)
}
// Clone creates a duplicate of the Program.
//
// Closing the duplicate does not affect the original, and vice versa.
//
// Cloning a nil Program returns nil.
func (p *Program) Clone() (*Program, error) {
if p == nil {
return nil, nil
}
dup, err := p.fd.Dup()
if err != nil {
return nil, fmt.Errorf("can't clone program: %w", err)
}
return &Program{p.VerifierLog, dup, p.name, "", p.typ}, nil
}
// Pin persists the Program on the BPF virtual file system past the lifetime of
// the process that created it
//
// Calling Pin on a previously pinned program will overwrite the path, except when
// the new path already exists. Re-pinning across filesystems is not supported.
//
// This requires bpffs to be mounted above fileName. See https://docs.cilium.io/en/k8s-doc/admin/#admin-mount-bpffs
func (p *Program) Pin(fileName string) error {
if err := internal.Pin(p.pinnedPath, fileName, p.fd); err != nil {
return err
}
p.pinnedPath = fileName
return nil
}
// Unpin removes the persisted state for the Program from the BPF virtual filesystem.
//
// Failed calls to Unpin will not alter the state returned by IsPinned.
//
// Unpinning an unpinned Program returns nil.
func (p *Program) Unpin() error {
if err := internal.Unpin(p.pinnedPath); err != nil {
return err
}
p.pinnedPath = ""
return nil
}
// IsPinned returns true if the Program has a non-empty pinned path.
func (p *Program) IsPinned() bool {
return p.pinnedPath != ""
}
// Close unloads the program from the kernel.
func (p *Program) Close() error {
if p == nil {
return nil
}
return p.fd.Close()
}
// Test runs the Program in the kernel with the given input and returns the
// value returned by the eBPF program. outLen may be zero.
//
// Note: the kernel expects at least 14 bytes input for an ethernet header for
// XDP and SKB programs.
//
// This function requires at least Linux 4.12.
func (p *Program) Test(in []byte) (uint32, []byte, error) {
ret, out, _, err := p.testRun(in, 1, nil)
if err != nil {
return ret, nil, fmt.Errorf("can't test program: %w", err)
}
return ret, out, nil
}
// Benchmark runs the Program with the given input for a number of times
// and returns the time taken per iteration.
//
// Returns the result of the last execution of the program and the time per
// run or an error. reset is called whenever the benchmark syscall is
// interrupted, and should be set to testing.B.ResetTimer or similar.
//
// Note: profiling a call to this function will skew it's results, see
// https://github.com/cilium/ebpf/issues/24
//
// This function requires at least Linux 4.12.
func (p *Program) Benchmark(in []byte, repeat int, reset func()) (uint32, time.Duration, error) {
ret, _, total, err := p.testRun(in, repeat, reset)
if err != nil {
return ret, total, fmt.Errorf("can't benchmark program: %w", err)
}
return ret, total, nil
}
var haveProgTestRun = internal.FeatureTest("BPF_PROG_TEST_RUN", "4.12", func() error {
prog, err := NewProgram(&ProgramSpec{
Type: SocketFilter,
Instructions: asm.Instructions{
asm.LoadImm(asm.R0, 0, asm.DWord),
asm.Return(),
},
License: "MIT",
})
if err != nil {
// This may be because we lack sufficient permissions, etc.
return err
}
defer prog.Close()
// Programs require at least 14 bytes input
in := make([]byte, 14)
attr := bpfProgTestRunAttr{
fd: uint32(prog.FD()),
dataSizeIn: uint32(len(in)),
dataIn: internal.NewSlicePointer(in),
}
err = bpfProgTestRun(&attr)
if errors.Is(err, unix.EINVAL) {
// Check for EINVAL specifically, rather than err != nil since we
// otherwise misdetect due to insufficient permissions.
return internal.ErrNotSupported
}
if errors.Is(err, unix.EINTR) {
// We know that PROG_TEST_RUN is supported if we get EINTR.
return nil
}
return err
})
func (p *Program) testRun(in []byte, repeat int, reset func()) (uint32, []byte, time.Duration, error) {
if uint(repeat) > math.MaxUint32 {
return 0, nil, 0, fmt.Errorf("repeat is too high")
}
if len(in) == 0 {
return 0, nil, 0, fmt.Errorf("missing input")
}
if uint(len(in)) > math.MaxUint32 {
return 0, nil, 0, fmt.Errorf("input is too long")
}
if err := haveProgTestRun(); err != nil {
return 0, nil, 0, err
}
// Older kernels ignore the dataSizeOut argument when copying to user space.
// Combined with things like bpf_xdp_adjust_head() we don't really know what the final
// size will be. Hence we allocate an output buffer which we hope will always be large
// enough, and panic if the kernel wrote past the end of the allocation.
// See https://patchwork.ozlabs.org/cover/1006822/
out := make([]byte, len(in)+outputPad)
fd, err := p.fd.Value()
if err != nil {
return 0, nil, 0, err
}
attr := bpfProgTestRunAttr{
fd: fd,
dataSizeIn: uint32(len(in)),
dataSizeOut: uint32(len(out)),
dataIn: internal.NewSlicePointer(in),
dataOut: internal.NewSlicePointer(out),
repeat: uint32(repeat),
}
for {
err = bpfProgTestRun(&attr)
if err == nil {
break
}
if errors.Is(err, unix.EINTR) {
if reset != nil {
reset()
}
continue
}
return 0, nil, 0, fmt.Errorf("can't run test: %w", err)
}
if int(attr.dataSizeOut) > cap(out) {
// Houston, we have a problem. The program created more data than we allocated,
// and the kernel wrote past the end of our buffer.
panic("kernel wrote past end of output buffer")
}
out = out[:int(attr.dataSizeOut)]
total := time.Duration(attr.duration) * time.Nanosecond
return attr.retval, out, total, nil
}
func unmarshalProgram(buf []byte) (*Program, error) {
if len(buf) != 4 {
return nil, errors.New("program id requires 4 byte value")
}
// Looking up an entry in a nested map or prog array returns an id,
// not an fd.
id := internal.NativeEndian.Uint32(buf)
return NewProgramFromID(ProgramID(id))
}
func marshalProgram(p *Program, length int) ([]byte, error) {
if length != 4 {
return nil, fmt.Errorf("can't marshal program to %d bytes", length)
}
value, err := p.fd.Value()
if err != nil {
return nil, err
}
buf := make([]byte, 4)
internal.NativeEndian.PutUint32(buf, value)
return buf, nil
}
// Attach a Program.
//
// Deprecated: use link.RawAttachProgram instead.
func (p *Program) Attach(fd int, typ AttachType, flags AttachFlags) error {
if fd < 0 {
return errors.New("invalid fd")
}
pfd, err := p.fd.Value()
if err != nil {
return err
}
attr := internal.BPFProgAttachAttr{
TargetFd: uint32(fd),
AttachBpfFd: pfd,
AttachType: uint32(typ),
AttachFlags: uint32(flags),
}
return internal.BPFProgAttach(&attr)
}
// Detach a Program.
//
// Deprecated: use link.RawDetachProgram instead.
func (p *Program) Detach(fd int, typ AttachType, flags AttachFlags) error {
if fd < 0 {
return errors.New("invalid fd")
}
if flags != 0 {
return errors.New("flags must be zero")
}
pfd, err := p.fd.Value()
if err != nil {
return err
}
attr := internal.BPFProgDetachAttr{
TargetFd: uint32(fd),
AttachBpfFd: pfd,
AttachType: uint32(typ),
}
return internal.BPFProgDetach(&attr)
}
// LoadPinnedProgram loads a Program from a BPF file.
//
// Requires at least Linux 4.11.
func LoadPinnedProgram(fileName string, opts *LoadPinOptions) (*Program, error) {
fd, err := internal.BPFObjGet(fileName, opts.Marshal())
if err != nil {
return nil, err
}
info, err := newProgramInfoFromFd(fd)
if err != nil {
_ = fd.Close()
return nil, fmt.Errorf("info for %s: %w", fileName, err)
}
return &Program{"", fd, filepath.Base(fileName), fileName, info.Type}, nil
}
// SanitizeName replaces all invalid characters in name with replacement.
// Passing a negative value for replacement will delete characters instead
// of replacing them. Use this to automatically generate valid names for maps
// and programs at runtime.
//
// The set of allowed characters depends on the running kernel version.
// Dots are only allowed as of kernel 5.2.
func SanitizeName(name string, replacement rune) string {
return strings.Map(func(char rune) rune {
if invalidBPFObjNameChar(char) {
return replacement
}
return char
}, name)
}
// ProgramGetNextID returns the ID of the next eBPF program.
//
// Returns ErrNotExist, if there is no next eBPF program.
func ProgramGetNextID(startID ProgramID) (ProgramID, error) {
id, err := objGetNextID(internal.BPF_PROG_GET_NEXT_ID, uint32(startID))
return ProgramID(id), err
}
// ID returns the systemwide unique ID of the program.
//
// Deprecated: use ProgramInfo.ID() instead.
func (p *Program) ID() (ProgramID, error) {
info, err := bpfGetProgInfoByFD(p.fd)
if err != nil {
return ProgramID(0), err
}
return ProgramID(info.id), nil
}
func resolveBTFType(kernel *btf.Spec, name string, progType ProgramType, attachType AttachType) (btf.Type, error) {
type match struct {
p ProgramType
a AttachType
}
var target btf.Type
var typeName, featureName string
switch (match{progType, attachType}) {
case match{LSM, AttachLSMMac}:
target = new(btf.Func)
typeName = "bpf_lsm_" + name
featureName = name + " LSM hook"
case match{Tracing, AttachTraceIter}:
target = new(btf.Func)
typeName = "bpf_iter_" + name
featureName = name + " iterator"
default:
return nil, nil
}
if kernel == nil {
var err error
kernel, err = btf.LoadKernelSpec()
if err != nil {
return nil, fmt.Errorf("load kernel spec: %w", err)
}
}
err := kernel.FindType(typeName, target)
if errors.Is(err, btf.ErrNotFound) {
return nil, &internal.UnsupportedFeatureError{
Name: featureName,
}
}
if err != nil {
return nil, fmt.Errorf("resolve BTF for %s: %w", featureName, err)
}
return target, nil
}

123
src/runtime/vendor/github.com/cilium/ebpf/run-tests.sh generated vendored Normal file
View File

@ -0,0 +1,123 @@
#!/bin/bash
# Test the current package under a different kernel.
# Requires virtme and qemu to be installed.
# Examples:
# Run all tests on a 5.4 kernel
# $ ./run-tests.sh 5.4
# Run a subset of tests:
# $ ./run-tests.sh 5.4 go test ./link
set -euo pipefail
script="$(realpath "$0")"
readonly script
# This script is a bit like a Matryoshka doll since it keeps re-executing itself
# in various different contexts:
#
# 1. invoked by the user like run-tests.sh 5.4
# 2. invoked by go test like run-tests.sh --exec-vm
# 3. invoked by init in the vm like run-tests.sh --exec-test
#
# This allows us to use all available CPU on the host machine to compile our
# code, and then only use the VM to execute the test. This is because the VM
# is usually slower at compiling than the host.
if [[ "${1:-}" = "--exec-vm" ]]; then
shift
input="$1"
shift
# Use sudo if /dev/kvm isn't accessible by the current user.
sudo=""
if [[ ! -r /dev/kvm || ! -w /dev/kvm ]]; then
sudo="sudo"
fi
readonly sudo
testdir="$(dirname "$1")"
output="$(mktemp -d)"
printf -v cmd "%q " "$@"
if [[ "$(stat -c '%t:%T' -L /proc/$$/fd/0)" == "1:3" ]]; then
# stdin is /dev/null, which doesn't play well with qemu. Use a fifo as a
# blocking substitute.
mkfifo "${output}/fake-stdin"
# Open for reading and writing to avoid blocking.
exec 0<> "${output}/fake-stdin"
rm "${output}/fake-stdin"
fi
$sudo virtme-run --kimg "${input}/bzImage" --memory 768M --pwd \
--rwdir="${testdir}=${testdir}" \
--rodir=/run/input="${input}" \
--rwdir=/run/output="${output}" \
--script-sh "PATH=\"$PATH\" \"$script\" --exec-test $cmd" \
--qemu-opts -smp 2 # need at least two CPUs for some tests
if [[ ! -e "${output}/success" ]]; then
exit 1
fi
$sudo rm -r "$output"
exit 0
elif [[ "${1:-}" = "--exec-test" ]]; then
shift
mount -t bpf bpf /sys/fs/bpf
mount -t tracefs tracefs /sys/kernel/debug/tracing
if [[ -d "/run/input/bpf" ]]; then
export KERNEL_SELFTESTS="/run/input/bpf"
fi
dmesg -C
if ! "$@"; then
dmesg
exit 1
fi
touch "/run/output/success"
exit 0
fi
readonly kernel_version="${1:-}"
if [[ -z "${kernel_version}" ]]; then
echo "Expecting kernel version as first argument"
exit 1
fi
shift
readonly kernel="linux-${kernel_version}.bz"
readonly selftests="linux-${kernel_version}-selftests-bpf.bz"
readonly input="$(mktemp -d)"
readonly tmp_dir="${TMPDIR:-/tmp}"
readonly branch="${BRANCH:-master}"
fetch() {
echo Fetching "${1}"
wget -nv -N -P "${tmp_dir}" "https://github.com/cilium/ci-kernels/raw/${branch}/${1}"
}
fetch "${kernel}"
cp "${tmp_dir}/${kernel}" "${input}/bzImage"
if fetch "${selftests}"; then
mkdir "${input}/bpf"
tar --strip-components=4 -xjf "${tmp_dir}/${selftests}" -C "${input}/bpf"
else
echo "No selftests found, disabling"
fi
args=(-v -short -coverpkg=./... -coverprofile=coverage.out -count 1 ./...)
if (( $# > 0 )); then
args=("$@")
fi
export GOFLAGS=-mod=readonly
export CGO_ENABLED=0
echo Testing on "${kernel_version}"
go test -exec "$script --exec-vm $input" "${args[@]}"
echo "Test successful on ${kernel_version}"
rm -r "${input}"

480
src/runtime/vendor/github.com/cilium/ebpf/syscalls.go generated vendored Normal file
View File

@ -0,0 +1,480 @@
package ebpf
import (
"errors"
"fmt"
"os"
"unsafe"
"github.com/cilium/ebpf/internal"
"github.com/cilium/ebpf/internal/btf"
"github.com/cilium/ebpf/internal/unix"
)
// ErrNotExist is returned when loading a non-existing map or program.
//
// Deprecated: use os.ErrNotExist instead.
var ErrNotExist = os.ErrNotExist
// invalidBPFObjNameChar returns true if char may not appear in
// a BPF object name.
func invalidBPFObjNameChar(char rune) bool {
dotAllowed := objNameAllowsDot() == nil
switch {
case char >= 'A' && char <= 'Z':
return false
case char >= 'a' && char <= 'z':
return false
case char >= '0' && char <= '9':
return false
case dotAllowed && char == '.':
return false
case char == '_':
return false
default:
return true
}
}
type bpfMapOpAttr struct {
mapFd uint32
padding uint32
key internal.Pointer
value internal.Pointer
flags uint64
}
type bpfBatchMapOpAttr struct {
inBatch internal.Pointer
outBatch internal.Pointer
keys internal.Pointer
values internal.Pointer
count uint32
mapFd uint32
elemFlags uint64
flags uint64
}
type bpfMapInfo struct {
map_type uint32 // since 4.12 1e2709769086
id uint32
key_size uint32
value_size uint32
max_entries uint32
map_flags uint32
name internal.BPFObjName // since 4.15 ad5b177bd73f
ifindex uint32 // since 4.16 52775b33bb50
btf_vmlinux_value_type_id uint32 // since 5.6 85d33df357b6
netns_dev uint64 // since 4.16 52775b33bb50
netns_ino uint64
btf_id uint32 // since 4.18 78958fca7ead
btf_key_type_id uint32 // since 4.18 9b2cf328b2ec
btf_value_type_id uint32
}
type bpfProgLoadAttr struct {
progType ProgramType
insCount uint32
instructions internal.Pointer
license internal.Pointer
logLevel uint32
logSize uint32
logBuf internal.Pointer
kernelVersion uint32 // since 4.1 2541517c32be
progFlags uint32 // since 4.11 e07b98d9bffe
progName internal.BPFObjName // since 4.15 067cae47771c
progIfIndex uint32 // since 4.15 1f6f4cb7ba21
expectedAttachType AttachType // since 4.17 5e43f899b03a
progBTFFd uint32
funcInfoRecSize uint32
funcInfo internal.Pointer
funcInfoCnt uint32
lineInfoRecSize uint32
lineInfo internal.Pointer
lineInfoCnt uint32
attachBTFID btf.TypeID
attachProgFd uint32
}
type bpfProgInfo struct {
prog_type uint32
id uint32
tag [unix.BPF_TAG_SIZE]byte
jited_prog_len uint32
xlated_prog_len uint32
jited_prog_insns internal.Pointer
xlated_prog_insns internal.Pointer
load_time uint64 // since 4.15 cb4d2b3f03d8
created_by_uid uint32
nr_map_ids uint32
map_ids internal.Pointer
name internal.BPFObjName // since 4.15 067cae47771c
ifindex uint32
gpl_compatible uint32
netns_dev uint64
netns_ino uint64
nr_jited_ksyms uint32
nr_jited_func_lens uint32
jited_ksyms internal.Pointer
jited_func_lens internal.Pointer
btf_id uint32
func_info_rec_size uint32
func_info internal.Pointer
nr_func_info uint32
nr_line_info uint32
line_info internal.Pointer
jited_line_info internal.Pointer
nr_jited_line_info uint32
line_info_rec_size uint32
jited_line_info_rec_size uint32
nr_prog_tags uint32
prog_tags internal.Pointer
run_time_ns uint64
run_cnt uint64
}
type bpfProgTestRunAttr struct {
fd uint32
retval uint32
dataSizeIn uint32
dataSizeOut uint32
dataIn internal.Pointer
dataOut internal.Pointer
repeat uint32
duration uint32
}
type bpfGetFDByIDAttr struct {
id uint32
next uint32
}
type bpfMapFreezeAttr struct {
mapFd uint32
}
type bpfObjGetNextIDAttr struct {
startID uint32
nextID uint32
openFlags uint32
}
func bpfProgLoad(attr *bpfProgLoadAttr) (*internal.FD, error) {
for {
fd, err := internal.BPF(internal.BPF_PROG_LOAD, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
// As of ~4.20 the verifier can be interrupted by a signal,
// and returns EAGAIN in that case.
if errors.Is(err, unix.EAGAIN) {
continue
}
if err != nil {
return nil, err
}
return internal.NewFD(uint32(fd)), nil
}
}
func bpfProgTestRun(attr *bpfProgTestRunAttr) error {
_, err := internal.BPF(internal.BPF_PROG_TEST_RUN, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
return err
}
var haveNestedMaps = internal.FeatureTest("nested maps", "4.12", func() error {
_, err := internal.BPFMapCreate(&internal.BPFMapCreateAttr{
MapType: uint32(ArrayOfMaps),
KeySize: 4,
ValueSize: 4,
MaxEntries: 1,
// Invalid file descriptor.
InnerMapFd: ^uint32(0),
})
if errors.Is(err, unix.EINVAL) {
return internal.ErrNotSupported
}
if errors.Is(err, unix.EBADF) {
return nil
}
return err
})
var haveMapMutabilityModifiers = internal.FeatureTest("read- and write-only maps", "5.2", func() error {
// This checks BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG. Since
// BPF_MAP_FREEZE appeared in 5.2 as well we don't do a separate check.
m, err := internal.BPFMapCreate(&internal.BPFMapCreateAttr{
MapType: uint32(Array),
KeySize: 4,
ValueSize: 4,
MaxEntries: 1,
Flags: unix.BPF_F_RDONLY_PROG,
})
if err != nil {
return internal.ErrNotSupported
}
_ = m.Close()
return nil
})
var haveMmapableMaps = internal.FeatureTest("mmapable maps", "5.5", func() error {
// This checks BPF_F_MMAPABLE, which appeared in 5.5 for array maps.
m, err := internal.BPFMapCreate(&internal.BPFMapCreateAttr{
MapType: uint32(Array),
KeySize: 4,
ValueSize: 4,
MaxEntries: 1,
Flags: unix.BPF_F_MMAPABLE,
})
if err != nil {
return internal.ErrNotSupported
}
_ = m.Close()
return nil
})
var haveInnerMaps = internal.FeatureTest("inner maps", "5.10", func() error {
// This checks BPF_F_INNER_MAP, which appeared in 5.10.
m, err := internal.BPFMapCreate(&internal.BPFMapCreateAttr{
MapType: uint32(Array),
KeySize: 4,
ValueSize: 4,
MaxEntries: 1,
Flags: unix.BPF_F_INNER_MAP,
})
if err != nil {
return internal.ErrNotSupported
}
_ = m.Close()
return nil
})
func bpfMapLookupElem(m *internal.FD, key, valueOut internal.Pointer) error {
fd, err := m.Value()
if err != nil {
return err
}
attr := bpfMapOpAttr{
mapFd: fd,
key: key,
value: valueOut,
}
_, err = internal.BPF(internal.BPF_MAP_LOOKUP_ELEM, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
return wrapMapError(err)
}
func bpfMapLookupAndDelete(m *internal.FD, key, valueOut internal.Pointer) error {
fd, err := m.Value()
if err != nil {
return err
}
attr := bpfMapOpAttr{
mapFd: fd,
key: key,
value: valueOut,
}
_, err = internal.BPF(internal.BPF_MAP_LOOKUP_AND_DELETE_ELEM, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
return wrapMapError(err)
}
func bpfMapUpdateElem(m *internal.FD, key, valueOut internal.Pointer, flags uint64) error {
fd, err := m.Value()
if err != nil {
return err
}
attr := bpfMapOpAttr{
mapFd: fd,
key: key,
value: valueOut,
flags: flags,
}
_, err = internal.BPF(internal.BPF_MAP_UPDATE_ELEM, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
return wrapMapError(err)
}
func bpfMapDeleteElem(m *internal.FD, key internal.Pointer) error {
fd, err := m.Value()
if err != nil {
return err
}
attr := bpfMapOpAttr{
mapFd: fd,
key: key,
}
_, err = internal.BPF(internal.BPF_MAP_DELETE_ELEM, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
return wrapMapError(err)
}
func bpfMapGetNextKey(m *internal.FD, key, nextKeyOut internal.Pointer) error {
fd, err := m.Value()
if err != nil {
return err
}
attr := bpfMapOpAttr{
mapFd: fd,
key: key,
value: nextKeyOut,
}
_, err = internal.BPF(internal.BPF_MAP_GET_NEXT_KEY, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
return wrapMapError(err)
}
func objGetNextID(cmd internal.BPFCmd, start uint32) (uint32, error) {
attr := bpfObjGetNextIDAttr{
startID: start,
}
_, err := internal.BPF(cmd, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
return attr.nextID, err
}
func bpfMapBatch(cmd internal.BPFCmd, m *internal.FD, inBatch, outBatch, keys, values internal.Pointer, count uint32, opts *BatchOptions) (uint32, error) {
fd, err := m.Value()
if err != nil {
return 0, err
}
attr := bpfBatchMapOpAttr{
inBatch: inBatch,
outBatch: outBatch,
keys: keys,
values: values,
count: count,
mapFd: fd,
}
if opts != nil {
attr.elemFlags = opts.ElemFlags
attr.flags = opts.Flags
}
_, err = internal.BPF(cmd, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
// always return count even on an error, as things like update might partially be fulfilled.
return attr.count, wrapMapError(err)
}
func wrapMapError(err error) error {
if err == nil {
return nil
}
if errors.Is(err, unix.ENOENT) {
return internal.SyscallError(ErrKeyNotExist, unix.ENOENT)
}
if errors.Is(err, unix.EEXIST) {
return internal.SyscallError(ErrKeyExist, unix.EEXIST)
}
if errors.Is(err, unix.ENOTSUPP) {
return internal.SyscallError(ErrNotSupported, unix.ENOTSUPP)
}
return err
}
func bpfMapFreeze(m *internal.FD) error {
fd, err := m.Value()
if err != nil {
return err
}
attr := bpfMapFreezeAttr{
mapFd: fd,
}
_, err = internal.BPF(internal.BPF_MAP_FREEZE, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
return err
}
func bpfGetProgInfoByFD(fd *internal.FD) (*bpfProgInfo, error) {
var info bpfProgInfo
if err := internal.BPFObjGetInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info)); err != nil {
return nil, fmt.Errorf("can't get program info: %w", err)
}
return &info, nil
}
func bpfGetMapInfoByFD(fd *internal.FD) (*bpfMapInfo, error) {
var info bpfMapInfo
err := internal.BPFObjGetInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info))
if err != nil {
return nil, fmt.Errorf("can't get map info: %w", err)
}
return &info, nil
}
var haveObjName = internal.FeatureTest("object names", "4.15", func() error {
attr := internal.BPFMapCreateAttr{
MapType: uint32(Array),
KeySize: 4,
ValueSize: 4,
MaxEntries: 1,
MapName: internal.NewBPFObjName("feature_test"),
}
fd, err := internal.BPFMapCreate(&attr)
if err != nil {
return internal.ErrNotSupported
}
_ = fd.Close()
return nil
})
var objNameAllowsDot = internal.FeatureTest("dot in object names", "5.2", func() error {
if err := haveObjName(); err != nil {
return err
}
attr := internal.BPFMapCreateAttr{
MapType: uint32(Array),
KeySize: 4,
ValueSize: 4,
MaxEntries: 1,
MapName: internal.NewBPFObjName(".test"),
}
fd, err := internal.BPFMapCreate(&attr)
if err != nil {
return internal.ErrNotSupported
}
_ = fd.Close()
return nil
})
var haveBatchAPI = internal.FeatureTest("map batch api", "5.6", func() error {
var maxEntries uint32 = 2
attr := internal.BPFMapCreateAttr{
MapType: uint32(Hash),
KeySize: 4,
ValueSize: 4,
MaxEntries: maxEntries,
}
fd, err := internal.BPFMapCreate(&attr)
if err != nil {
return internal.ErrNotSupported
}
defer fd.Close()
keys := []uint32{1, 2}
values := []uint32{3, 4}
kp, _ := marshalPtr(keys, 8)
vp, _ := marshalPtr(values, 8)
nilPtr := internal.NewPointer(nil)
_, err = bpfMapBatch(internal.BPF_MAP_UPDATE_BATCH, fd, nilPtr, nilPtr, kp, vp, maxEntries, nil)
if err != nil {
return internal.ErrNotSupported
}
return nil
})
func bpfObjGetFDByID(cmd internal.BPFCmd, id uint32) (*internal.FD, error) {
attr := bpfGetFDByIDAttr{
id: id,
}
ptr, err := internal.BPF(cmd, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
return internal.NewFD(uint32(ptr)), err
}

248
src/runtime/vendor/github.com/cilium/ebpf/types.go generated vendored Normal file
View File

@ -0,0 +1,248 @@
package ebpf
import (
"github.com/cilium/ebpf/internal/unix"
)
//go:generate stringer -output types_string.go -type=MapType,ProgramType,AttachType,PinType
// MapType indicates the type map structure
// that will be initialized in the kernel.
type MapType uint32
// All the various map types that can be created
const (
UnspecifiedMap MapType = iota
// Hash is a hash map
Hash
// Array is an array map
Array
// ProgramArray - A program array map is a special kind of array map whose map
// values contain only file descriptors referring to other eBPF
// programs. Thus, both the key_size and value_size must be
// exactly four bytes. This map is used in conjunction with the
// TailCall helper.
ProgramArray
// PerfEventArray - A perf event array is used in conjunction with PerfEventRead
// and PerfEventOutput calls, to read the raw bpf_perf_data from the registers.
PerfEventArray
// PerCPUHash - This data structure is useful for people who have high performance
// network needs and can reconcile adds at the end of some cycle, so that
// hashes can be lock free without the use of XAdd, which can be costly.
PerCPUHash
// PerCPUArray - This data structure is useful for people who have high performance
// network needs and can reconcile adds at the end of some cycle, so that
// hashes can be lock free without the use of XAdd, which can be costly.
// Each CPU gets a copy of this hash, the contents of all of which can be reconciled
// later.
PerCPUArray
// StackTrace - This holds whole user and kernel stack traces, it can be retrieved with
// GetStackID
StackTrace
// CGroupArray - This is a very niche structure used to help SKBInCGroup determine
// if an skb is from a socket belonging to a specific cgroup
CGroupArray
// LRUHash - This allows you to create a small hash structure that will purge the
// least recently used items rather than thow an error when you run out of memory
LRUHash
// LRUCPUHash - This is NOT like PerCPUHash, this structure is shared among the CPUs,
// it has more to do with including the CPU id with the LRU calculation so that if a
// particular CPU is using a value over-and-over again, then it will be saved, but if
// a value is being retrieved a lot but sparsely across CPUs it is not as important, basically
// giving weight to CPU locality over overall usage.
LRUCPUHash
// LPMTrie - This is an implementation of Longest-Prefix-Match Trie structure. It is useful,
// for storing things like IP addresses which can be bit masked allowing for keys of differing
// values to refer to the same reference based on their masks. See wikipedia for more details.
LPMTrie
// ArrayOfMaps - Each item in the array is another map. The inner map mustn't be a map of maps
// itself.
ArrayOfMaps
// HashOfMaps - Each item in the hash map is another map. The inner map mustn't be a map of maps
// itself.
HashOfMaps
// DevMap - Specialized map to store references to network devices.
DevMap
// SockMap - Specialized map to store references to sockets.
SockMap
// CPUMap - Specialized map to store references to CPUs.
CPUMap
// XSKMap - Specialized map for XDP programs to store references to open sockets.
XSKMap
// SockHash - Specialized hash to store references to sockets.
SockHash
// CGroupStorage - Special map for CGroups.
CGroupStorage
// ReusePortSockArray - Specialized map to store references to sockets that can be reused.
ReusePortSockArray
// PerCPUCGroupStorage - Special per CPU map for CGroups.
PerCPUCGroupStorage
// Queue - FIFO storage for BPF programs.
Queue
// Stack - LIFO storage for BPF programs.
Stack
// SkStorage - Specialized map for local storage at SK for BPF programs.
SkStorage
// DevMapHash - Hash-based indexing scheme for references to network devices.
DevMapHash
StructOpts
RingBuf
InodeStorage
TaskStorage
)
// hasPerCPUValue returns true if the Map stores a value per CPU.
func (mt MapType) hasPerCPUValue() bool {
return mt == PerCPUHash || mt == PerCPUArray || mt == LRUCPUHash
}
// canStoreMap returns true if the map type accepts a map fd
// for update and returns a map id for lookup.
func (mt MapType) canStoreMap() bool {
return mt == ArrayOfMaps || mt == HashOfMaps
}
// canStoreProgram returns true if the map type accepts a program fd
// for update and returns a program id for lookup.
func (mt MapType) canStoreProgram() bool {
return mt == ProgramArray
}
// ProgramType of the eBPF program
type ProgramType uint32
// eBPF program types
const (
UnspecifiedProgram ProgramType = iota
SocketFilter
Kprobe
SchedCLS
SchedACT
TracePoint
XDP
PerfEvent
CGroupSKB
CGroupSock
LWTIn
LWTOut
LWTXmit
SockOps
SkSKB
CGroupDevice
SkMsg
RawTracepoint
CGroupSockAddr
LWTSeg6Local
LircMode2
SkReuseport
FlowDissector
CGroupSysctl
RawTracepointWritable
CGroupSockopt
Tracing
StructOps
Extension
LSM
SkLookup
)
// AttachType of the eBPF program, needed to differentiate allowed context accesses in
// some newer program types like CGroupSockAddr. Should be set to AttachNone if not required.
// Will cause invalid argument (EINVAL) at program load time if set incorrectly.
type AttachType uint32
// AttachNone is an alias for AttachCGroupInetIngress for readability reasons.
const AttachNone AttachType = 0
const (
AttachCGroupInetIngress AttachType = iota
AttachCGroupInetEgress
AttachCGroupInetSockCreate
AttachCGroupSockOps
AttachSkSKBStreamParser
AttachSkSKBStreamVerdict
AttachCGroupDevice
AttachSkMsgVerdict
AttachCGroupInet4Bind
AttachCGroupInet6Bind
AttachCGroupInet4Connect
AttachCGroupInet6Connect
AttachCGroupInet4PostBind
AttachCGroupInet6PostBind
AttachCGroupUDP4Sendmsg
AttachCGroupUDP6Sendmsg
AttachLircMode2
AttachFlowDissector
AttachCGroupSysctl
AttachCGroupUDP4Recvmsg
AttachCGroupUDP6Recvmsg
AttachCGroupGetsockopt
AttachCGroupSetsockopt
AttachTraceRawTp
AttachTraceFEntry
AttachTraceFExit
AttachModifyReturn
AttachLSMMac
AttachTraceIter
AttachCgroupInet4GetPeername
AttachCgroupInet6GetPeername
AttachCgroupInet4GetSockname
AttachCgroupInet6GetSockname
AttachXDPDevMap
AttachCgroupInetSockRelease
AttachXDPCPUMap
AttachSkLookup
AttachXDP
)
// AttachFlags of the eBPF program used in BPF_PROG_ATTACH command
type AttachFlags uint32
// PinType determines whether a map is pinned into a BPFFS.
type PinType int
// Valid pin types.
//
// Mirrors enum libbpf_pin_type.
const (
PinNone PinType = iota
// Pin an object by using its name as the filename.
PinByName
)
// LoadPinOptions control how a pinned object is loaded.
type LoadPinOptions struct {
// Request a read-only or write-only object. The default is a read-write
// object. Only one of the flags may be set.
ReadOnly bool
WriteOnly bool
// Raw flags for the syscall. Other fields of this struct take precedence.
Flags uint32
}
// Marshal returns a value suitable for BPF_OBJ_GET syscall file_flags parameter.
func (lpo *LoadPinOptions) Marshal() uint32 {
if lpo == nil {
return 0
}
flags := lpo.Flags
if lpo.ReadOnly {
flags |= unix.BPF_F_RDONLY
}
if lpo.WriteOnly {
flags |= unix.BPF_F_WRONLY
}
return flags
}
// BatchOptions batch map operations options
//
// Mirrors libbpf struct bpf_map_batch_opts
// Currently BPF_F_FLAG is the only supported
// flag (for ElemFlags).
type BatchOptions struct {
ElemFlags uint64
Flags uint64
}

View File

@ -0,0 +1,172 @@
// Code generated by "stringer -output types_string.go -type=MapType,ProgramType,AttachType,PinType"; DO NOT EDIT.
package ebpf
import "strconv"
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[UnspecifiedMap-0]
_ = x[Hash-1]
_ = x[Array-2]
_ = x[ProgramArray-3]
_ = x[PerfEventArray-4]
_ = x[PerCPUHash-5]
_ = x[PerCPUArray-6]
_ = x[StackTrace-7]
_ = x[CGroupArray-8]
_ = x[LRUHash-9]
_ = x[LRUCPUHash-10]
_ = x[LPMTrie-11]
_ = x[ArrayOfMaps-12]
_ = x[HashOfMaps-13]
_ = x[DevMap-14]
_ = x[SockMap-15]
_ = x[CPUMap-16]
_ = x[XSKMap-17]
_ = x[SockHash-18]
_ = x[CGroupStorage-19]
_ = x[ReusePortSockArray-20]
_ = x[PerCPUCGroupStorage-21]
_ = x[Queue-22]
_ = x[Stack-23]
_ = x[SkStorage-24]
_ = x[DevMapHash-25]
_ = x[StructOpts-26]
_ = x[RingBuf-27]
_ = x[InodeStorage-28]
_ = x[TaskStorage-29]
}
const _MapType_name = "UnspecifiedMapHashArrayProgramArrayPerfEventArrayPerCPUHashPerCPUArrayStackTraceCGroupArrayLRUHashLRUCPUHashLPMTrieArrayOfMapsHashOfMapsDevMapSockMapCPUMapXSKMapSockHashCGroupStorageReusePortSockArrayPerCPUCGroupStorageQueueStackSkStorageDevMapHashStructOptsRingBufInodeStorageTaskStorage"
var _MapType_index = [...]uint16{0, 14, 18, 23, 35, 49, 59, 70, 80, 91, 98, 108, 115, 126, 136, 142, 149, 155, 161, 169, 182, 200, 219, 224, 229, 238, 248, 258, 265, 277, 288}
func (i MapType) String() string {
if i >= MapType(len(_MapType_index)-1) {
return "MapType(" + strconv.FormatInt(int64(i), 10) + ")"
}
return _MapType_name[_MapType_index[i]:_MapType_index[i+1]]
}
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[UnspecifiedProgram-0]
_ = x[SocketFilter-1]
_ = x[Kprobe-2]
_ = x[SchedCLS-3]
_ = x[SchedACT-4]
_ = x[TracePoint-5]
_ = x[XDP-6]
_ = x[PerfEvent-7]
_ = x[CGroupSKB-8]
_ = x[CGroupSock-9]
_ = x[LWTIn-10]
_ = x[LWTOut-11]
_ = x[LWTXmit-12]
_ = x[SockOps-13]
_ = x[SkSKB-14]
_ = x[CGroupDevice-15]
_ = x[SkMsg-16]
_ = x[RawTracepoint-17]
_ = x[CGroupSockAddr-18]
_ = x[LWTSeg6Local-19]
_ = x[LircMode2-20]
_ = x[SkReuseport-21]
_ = x[FlowDissector-22]
_ = x[CGroupSysctl-23]
_ = x[RawTracepointWritable-24]
_ = x[CGroupSockopt-25]
_ = x[Tracing-26]
_ = x[StructOps-27]
_ = x[Extension-28]
_ = x[LSM-29]
_ = x[SkLookup-30]
}
const _ProgramType_name = "UnspecifiedProgramSocketFilterKprobeSchedCLSSchedACTTracePointXDPPerfEventCGroupSKBCGroupSockLWTInLWTOutLWTXmitSockOpsSkSKBCGroupDeviceSkMsgRawTracepointCGroupSockAddrLWTSeg6LocalLircMode2SkReuseportFlowDissectorCGroupSysctlRawTracepointWritableCGroupSockoptTracingStructOpsExtensionLSMSkLookup"
var _ProgramType_index = [...]uint16{0, 18, 30, 36, 44, 52, 62, 65, 74, 83, 93, 98, 104, 111, 118, 123, 135, 140, 153, 167, 179, 188, 199, 212, 224, 245, 258, 265, 274, 283, 286, 294}
func (i ProgramType) String() string {
if i >= ProgramType(len(_ProgramType_index)-1) {
return "ProgramType(" + strconv.FormatInt(int64(i), 10) + ")"
}
return _ProgramType_name[_ProgramType_index[i]:_ProgramType_index[i+1]]
}
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[AttachNone-0]
_ = x[AttachCGroupInetIngress-0]
_ = x[AttachCGroupInetEgress-1]
_ = x[AttachCGroupInetSockCreate-2]
_ = x[AttachCGroupSockOps-3]
_ = x[AttachSkSKBStreamParser-4]
_ = x[AttachSkSKBStreamVerdict-5]
_ = x[AttachCGroupDevice-6]
_ = x[AttachSkMsgVerdict-7]
_ = x[AttachCGroupInet4Bind-8]
_ = x[AttachCGroupInet6Bind-9]
_ = x[AttachCGroupInet4Connect-10]
_ = x[AttachCGroupInet6Connect-11]
_ = x[AttachCGroupInet4PostBind-12]
_ = x[AttachCGroupInet6PostBind-13]
_ = x[AttachCGroupUDP4Sendmsg-14]
_ = x[AttachCGroupUDP6Sendmsg-15]
_ = x[AttachLircMode2-16]
_ = x[AttachFlowDissector-17]
_ = x[AttachCGroupSysctl-18]
_ = x[AttachCGroupUDP4Recvmsg-19]
_ = x[AttachCGroupUDP6Recvmsg-20]
_ = x[AttachCGroupGetsockopt-21]
_ = x[AttachCGroupSetsockopt-22]
_ = x[AttachTraceRawTp-23]
_ = x[AttachTraceFEntry-24]
_ = x[AttachTraceFExit-25]
_ = x[AttachModifyReturn-26]
_ = x[AttachLSMMac-27]
_ = x[AttachTraceIter-28]
_ = x[AttachCgroupInet4GetPeername-29]
_ = x[AttachCgroupInet6GetPeername-30]
_ = x[AttachCgroupInet4GetSockname-31]
_ = x[AttachCgroupInet6GetSockname-32]
_ = x[AttachXDPDevMap-33]
_ = x[AttachCgroupInetSockRelease-34]
_ = x[AttachXDPCPUMap-35]
_ = x[AttachSkLookup-36]
_ = x[AttachXDP-37]
}
const _AttachType_name = "AttachNoneAttachCGroupInetEgressAttachCGroupInetSockCreateAttachCGroupSockOpsAttachSkSKBStreamParserAttachSkSKBStreamVerdictAttachCGroupDeviceAttachSkMsgVerdictAttachCGroupInet4BindAttachCGroupInet6BindAttachCGroupInet4ConnectAttachCGroupInet6ConnectAttachCGroupInet4PostBindAttachCGroupInet6PostBindAttachCGroupUDP4SendmsgAttachCGroupUDP6SendmsgAttachLircMode2AttachFlowDissectorAttachCGroupSysctlAttachCGroupUDP4RecvmsgAttachCGroupUDP6RecvmsgAttachCGroupGetsockoptAttachCGroupSetsockoptAttachTraceRawTpAttachTraceFEntryAttachTraceFExitAttachModifyReturnAttachLSMMacAttachTraceIterAttachCgroupInet4GetPeernameAttachCgroupInet6GetPeernameAttachCgroupInet4GetSocknameAttachCgroupInet6GetSocknameAttachXDPDevMapAttachCgroupInetSockReleaseAttachXDPCPUMapAttachSkLookupAttachXDP"
var _AttachType_index = [...]uint16{0, 10, 32, 58, 77, 100, 124, 142, 160, 181, 202, 226, 250, 275, 300, 323, 346, 361, 380, 398, 421, 444, 466, 488, 504, 521, 537, 555, 567, 582, 610, 638, 666, 694, 709, 736, 751, 765, 774}
func (i AttachType) String() string {
if i >= AttachType(len(_AttachType_index)-1) {
return "AttachType(" + strconv.FormatInt(int64(i), 10) + ")"
}
return _AttachType_name[_AttachType_index[i]:_AttachType_index[i+1]]
}
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[PinNone-0]
_ = x[PinByName-1]
}
const _PinType_name = "PinNonePinByName"
var _PinType_index = [...]uint8{0, 7, 16}
func (i PinType) String() string {
if i < 0 || i >= PinType(len(_PinType_index)-1) {
return "PinType(" + strconv.FormatInt(int64(i), 10) + ")"
}
return _PinType_name[_PinType_index[i]:_PinType_index[i+1]]
}

View File

@ -0,0 +1,19 @@
# Copyright (C) 2017 SUSE LLC. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
language: go
go:
- 1.7.x
- 1.8.x
- tip
os:
- linux
- osx
script:
- go test -cover -v ./...
notifications:
email: false

View File

@ -0,0 +1,28 @@
Copyright (C) 2014-2015 Docker Inc & Go Authors. All rights reserved.
Copyright (C) 2017 SUSE LLC. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,65 @@
## `filepath-securejoin` ##
[![Build Status](https://travis-ci.org/cyphar/filepath-securejoin.svg?branch=master)](https://travis-ci.org/cyphar/filepath-securejoin)
An implementation of `SecureJoin`, a [candidate for inclusion in the Go
standard library][go#20126]. The purpose of this function is to be a "secure"
alternative to `filepath.Join`, and in particular it provides certain
guarantees that are not provided by `filepath.Join`.
This is the function prototype:
```go
func SecureJoin(root, unsafePath string) (string, error)
```
This library **guarantees** the following:
* If no error is set, the resulting string **must** be a child path of
`SecureJoin` and will not contain any symlink path components (they will all
be expanded).
* When expanding symlinks, all symlink path components **must** be resolved
relative to the provided root. In particular, this can be considered a
userspace implementation of how `chroot(2)` operates on file paths. Note that
these symlinks will **not** be expanded lexically (`filepath.Clean` is not
called on the input before processing).
* Non-existant path components are unaffected by `SecureJoin` (similar to
`filepath.EvalSymlinks`'s semantics).
* The returned path will always be `filepath.Clean`ed and thus not contain any
`..` components.
A (trivial) implementation of this function on GNU/Linux systems could be done
with the following (note that this requires root privileges and is far more
opaque than the implementation in this library, and also requires that
`readlink` is inside the `root` path):
```go
package securejoin
import (
"os/exec"
"path/filepath"
)
func SecureJoin(root, unsafePath string) (string, error) {
unsafePath = string(filepath.Separator) + unsafePath
cmd := exec.Command("chroot", root,
"readlink", "--canonicalize-missing", "--no-newline", unsafePath)
output, err := cmd.CombinedOutput()
if err != nil {
return "", err
}
expanded := string(output)
return filepath.Join(root, expanded), nil
}
```
[go#20126]: https://github.com/golang/go/issues/20126
### License ###
The license of this project is the same as Go, which is a BSD 3-clause license
available in the `LICENSE` file.

View File

@ -0,0 +1 @@
0.2.2

View File

@ -0,0 +1,134 @@
// Copyright (C) 2014-2015 Docker Inc & Go Authors. All rights reserved.
// Copyright (C) 2017 SUSE LLC. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package securejoin is an implementation of the hopefully-soon-to-be-included
// SecureJoin helper that is meant to be part of the "path/filepath" package.
// The purpose of this project is to provide a PoC implementation to make the
// SecureJoin proposal (https://github.com/golang/go/issues/20126) more
// tangible.
package securejoin
import (
"bytes"
"os"
"path/filepath"
"strings"
"syscall"
"github.com/pkg/errors"
)
// ErrSymlinkLoop is returned by SecureJoinVFS when too many symlinks have been
// evaluated in attempting to securely join the two given paths.
var ErrSymlinkLoop = errors.Wrap(syscall.ELOOP, "secure join")
// IsNotExist tells you if err is an error that implies that either the path
// accessed does not exist (or path components don't exist). This is
// effectively a more broad version of os.IsNotExist.
func IsNotExist(err error) bool {
// If it's a bone-fide ENOENT just bail.
if os.IsNotExist(errors.Cause(err)) {
return true
}
// Check that it's not actually an ENOTDIR, which in some cases is a more
// convoluted case of ENOENT (usually involving weird paths).
var errno error
switch err := errors.Cause(err).(type) {
case *os.PathError:
errno = err.Err
case *os.LinkError:
errno = err.Err
case *os.SyscallError:
errno = err.Err
}
return errno == syscall.ENOTDIR || errno == syscall.ENOENT
}
// SecureJoinVFS joins the two given path components (similar to Join) except
// that the returned path is guaranteed to be scoped inside the provided root
// path (when evaluated). Any symbolic links in the path are evaluated with the
// given root treated as the root of the filesystem, similar to a chroot. The
// filesystem state is evaluated through the given VFS interface (if nil, the
// standard os.* family of functions are used).
//
// Note that the guarantees provided by this function only apply if the path
// components in the returned string are not modified (in other words are not
// replaced with symlinks on the filesystem) after this function has returned.
// Such a symlink race is necessarily out-of-scope of SecureJoin.
func SecureJoinVFS(root, unsafePath string, vfs VFS) (string, error) {
// Use the os.* VFS implementation if none was specified.
if vfs == nil {
vfs = osVFS{}
}
var path bytes.Buffer
n := 0
for unsafePath != "" {
if n > 255 {
return "", ErrSymlinkLoop
}
// Next path component, p.
i := strings.IndexRune(unsafePath, filepath.Separator)
var p string
if i == -1 {
p, unsafePath = unsafePath, ""
} else {
p, unsafePath = unsafePath[:i], unsafePath[i+1:]
}
// Create a cleaned path, using the lexical semantics of /../a, to
// create a "scoped" path component which can safely be joined to fullP
// for evaluation. At this point, path.String() doesn't contain any
// symlink components.
cleanP := filepath.Clean(string(filepath.Separator) + path.String() + p)
if cleanP == string(filepath.Separator) {
path.Reset()
continue
}
fullP := filepath.Clean(root + cleanP)
// Figure out whether the path is a symlink.
fi, err := vfs.Lstat(fullP)
if err != nil && !IsNotExist(err) {
return "", err
}
// Treat non-existent path components the same as non-symlinks (we
// can't do any better here).
if IsNotExist(err) || fi.Mode()&os.ModeSymlink == 0 {
path.WriteString(p)
path.WriteRune(filepath.Separator)
continue
}
// Only increment when we actually dereference a link.
n++
// It's a symlink, expand it by prepending it to the yet-unparsed path.
dest, err := vfs.Readlink(fullP)
if err != nil {
return "", err
}
// Absolute symlinks reset any work we've already done.
if filepath.IsAbs(dest) {
path.Reset()
}
unsafePath = dest + string(filepath.Separator) + unsafePath
}
// We have to clean path.String() here because it may contain '..'
// components that are entirely lexical, but would be misleading otherwise.
// And finally do a final clean to ensure that root is also lexically
// clean.
fullP := filepath.Clean(string(filepath.Separator) + path.String())
return filepath.Clean(root + fullP), nil
}
// SecureJoin is a wrapper around SecureJoinVFS that just uses the os.* library
// of functions as the VFS. If in doubt, use this function over SecureJoinVFS.
func SecureJoin(root, unsafePath string) (string, error) {
return SecureJoinVFS(root, unsafePath, nil)
}

View File

@ -0,0 +1 @@
github.com/pkg/errors v0.8.0

View File

@ -0,0 +1,41 @@
// Copyright (C) 2017 SUSE LLC. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package securejoin
import "os"
// In future this should be moved into a separate package, because now there
// are several projects (umoci and go-mtree) that are using this sort of
// interface.
// VFS is the minimal interface necessary to use SecureJoinVFS. A nil VFS is
// equivalent to using the standard os.* family of functions. This is mainly
// used for the purposes of mock testing, but also can be used to otherwise use
// SecureJoin with VFS-like system.
type VFS interface {
// Lstat returns a FileInfo describing the named file. If the file is a
// symbolic link, the returned FileInfo describes the symbolic link. Lstat
// makes no attempt to follow the link. These semantics are identical to
// os.Lstat.
Lstat(name string) (os.FileInfo, error)
// Readlink returns the destination of the named symbolic link. These
// semantics are identical to os.Readlink.
Readlink(name string) (string, error)
}
// osVFS is the "nil" VFS, in that it just passes everything through to the os
// module.
type osVFS struct{}
// Lstat returns a FileInfo describing the named file. If the file is a
// symbolic link, the returned FileInfo describes the symbolic link. Lstat
// makes no attempt to follow the link. These semantics are identical to
// os.Lstat.
func (o osVFS) Lstat(name string) (os.FileInfo, error) { return os.Lstat(name) }
// Readlink returns the destination of the named symbolic link. These
// semantics are identical to os.Readlink.
func (o osVFS) Readlink(name string) (string, error) { return os.Readlink(name) }

View File

@ -0,0 +1,61 @@
// +build linux
package cgroups
import (
"github.com/opencontainers/runc/libcontainer/configs"
)
type Manager interface {
// Apply creates a cgroup, if not yet created, and adds a process
// with the specified pid into that cgroup. A special value of -1
// can be used to merely create a cgroup.
Apply(pid int) error
// GetPids returns the PIDs of all processes inside the cgroup.
GetPids() ([]int, error)
// GetAllPids returns the PIDs of all processes inside the cgroup
// any all its sub-cgroups.
GetAllPids() ([]int, error)
// GetStats returns cgroups statistics.
GetStats() (*Stats, error)
// Freeze sets the freezer cgroup to the specified state.
Freeze(state configs.FreezerState) error
// Destroy removes cgroup.
Destroy() error
// Path returns a cgroup path to the specified controller/subsystem.
// For cgroupv2, the argument is unused and can be empty.
Path(string) string
// Set sets cgroup resources parameters/limits. If the argument is nil,
// the resources specified during Manager creation (or the previous call
// to Set) are used.
Set(r *configs.Resources) error
// GetPaths returns cgroup path(s) to save in a state file in order to
// restore later.
//
// For cgroup v1, a key is cgroup subsystem name, and the value is the
// path to the cgroup for this subsystem.
//
// For cgroup v2 unified hierarchy, a key is "", and the value is the
// unified path.
GetPaths() map[string]string
// GetCgroups returns the cgroup data as configured.
GetCgroups() (*configs.Cgroup, error)
// GetFreezerState retrieves the current FreezerState of the cgroup.
GetFreezerState() (configs.FreezerState, error)
// Exists returns whether the cgroup path exists or not.
Exists() bool
// OOMKillCount reports OOM kill count for the cgroup.
OOMKillCount() (uint64, error)
}

View File

@ -0,0 +1,3 @@
// +build !linux
package cgroups

View File

@ -0,0 +1,382 @@
// +build linux
// SPDX-License-Identifier: Apache-2.0
/*
* Copyright (C) 2020 Aleksa Sarai <cyphar@cyphar.com>
* Copyright (C) 2020 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package devices
import (
"bufio"
"io"
"regexp"
"sort"
"strconv"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/pkg/errors"
)
// deviceMeta is a Rule without the Allow or Permissions fields, and no
// wildcard-type support. It's effectively the "match" portion of a metadata
// rule, for the purposes of our emulation.
type deviceMeta struct {
node devices.Type
major int64
minor int64
}
// deviceRule is effectively the tuple (deviceMeta, Permissions).
type deviceRule struct {
meta deviceMeta
perms devices.Permissions
}
// deviceRules is a mapping of device metadata rules to the associated
// permissions in the ruleset.
type deviceRules map[deviceMeta]devices.Permissions
func (r deviceRules) orderedEntries() []deviceRule {
var rules []deviceRule
for meta, perms := range r {
rules = append(rules, deviceRule{meta: meta, perms: perms})
}
sort.Slice(rules, func(i, j int) bool {
// Sort by (major, minor, type).
a, b := rules[i].meta, rules[j].meta
return a.major < b.major ||
(a.major == b.major && a.minor < b.minor) ||
(a.major == b.major && a.minor == b.minor && a.node < b.node)
})
return rules
}
type Emulator struct {
defaultAllow bool
rules deviceRules
}
func (e *Emulator) IsBlacklist() bool {
return e.defaultAllow
}
func (e *Emulator) IsAllowAll() bool {
return e.IsBlacklist() && len(e.rules) == 0
}
var devicesListRegexp = regexp.MustCompile(`^([abc])\s+(\d+|\*):(\d+|\*)\s+([rwm]+)$`)
func parseLine(line string) (*deviceRule, error) {
matches := devicesListRegexp.FindStringSubmatch(line)
if matches == nil {
return nil, errors.Errorf("line doesn't match devices.list format")
}
var (
rule deviceRule
node = matches[1]
major = matches[2]
minor = matches[3]
perms = matches[4]
)
// Parse the node type.
switch node {
case "a":
// Super-special case -- "a" always means every device with every
// access mode. In fact, for devices.list this actually indicates that
// the cgroup is in black-list mode.
// TODO: Double-check that the entire file is "a *:* rwm".
return nil, nil
case "b":
rule.meta.node = devices.BlockDevice
case "c":
rule.meta.node = devices.CharDevice
default:
// Should never happen!
return nil, errors.Errorf("unknown device type %q", node)
}
// Parse the major number.
if major == "*" {
rule.meta.major = devices.Wildcard
} else {
val, err := strconv.ParseUint(major, 10, 32)
if err != nil {
return nil, errors.Wrap(err, "parse major number")
}
rule.meta.major = int64(val)
}
// Parse the minor number.
if minor == "*" {
rule.meta.minor = devices.Wildcard
} else {
val, err := strconv.ParseUint(minor, 10, 32)
if err != nil {
return nil, errors.Wrap(err, "parse minor number")
}
rule.meta.minor = int64(val)
}
// Parse the access permissions.
rule.perms = devices.Permissions(perms)
if !rule.perms.IsValid() || rule.perms.IsEmpty() {
// Should never happen!
return nil, errors.Errorf("parse access mode: contained unknown modes or is empty: %q", perms)
}
return &rule, nil
}
func (e *Emulator) addRule(rule deviceRule) error {
if e.rules == nil {
e.rules = make(map[deviceMeta]devices.Permissions)
}
// Merge with any pre-existing permissions.
oldPerms := e.rules[rule.meta]
newPerms := rule.perms.Union(oldPerms)
e.rules[rule.meta] = newPerms
return nil
}
func (e *Emulator) rmRule(rule deviceRule) error {
// Give an error if any of the permissions requested to be removed are
// present in a partially-matching wildcard rule, because such rules will
// be ignored by cgroupv1.
//
// This is a diversion from cgroupv1, but is necessary to avoid leading
// users into a false sense of security. cgroupv1 will silently(!) ignore
// requests to remove partial exceptions, but we really shouldn't do that.
//
// It may seem like we could just "split" wildcard rules which hit this
// issue, but unfortunately there are 2^32 possible major and minor
// numbers, which would exhaust kernel memory quickly if we did this. Not
// to mention it'd be really slow (the kernel side is implemented as a
// linked-list of exceptions).
for _, partialMeta := range []deviceMeta{
{node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor},
{node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard},
{node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard},
} {
// This wildcard rule is equivalent to the requested rule, so skip it.
if rule.meta == partialMeta {
continue
}
// Only give an error if the set of permissions overlap.
partialPerms := e.rules[partialMeta]
if !partialPerms.Intersection(rule.perms).IsEmpty() {
return errors.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms)
}
}
// Subtract all of the permissions listed from the full match rule. If the
// rule didn't exist, all of this is a no-op.
newPerms := e.rules[rule.meta].Difference(rule.perms)
if newPerms.IsEmpty() {
delete(e.rules, rule.meta)
} else {
e.rules[rule.meta] = newPerms
}
// TODO: The actual cgroup code doesn't care if an exception didn't exist
// during removal, so not erroring out here is /accurate/ but quite
// worrying. Maybe we should do additional validation, but again we
// have to worry about backwards-compatibility.
return nil
}
func (e *Emulator) allow(rule *deviceRule) error {
// This cgroup is configured as a black-list. Reset the entire emulator,
// and put is into black-list mode.
if rule == nil || rule.meta.node == devices.WildcardDevice {
*e = Emulator{
defaultAllow: true,
rules: nil,
}
return nil
}
var err error
if e.defaultAllow {
err = errors.Wrap(e.rmRule(*rule), "remove 'deny' exception")
} else {
err = errors.Wrap(e.addRule(*rule), "add 'allow' exception")
}
return err
}
func (e *Emulator) deny(rule *deviceRule) error {
// This cgroup is configured as a white-list. Reset the entire emulator,
// and put is into white-list mode.
if rule == nil || rule.meta.node == devices.WildcardDevice {
*e = Emulator{
defaultAllow: false,
rules: nil,
}
return nil
}
var err error
if e.defaultAllow {
err = errors.Wrap(e.addRule(*rule), "add 'deny' exception")
} else {
err = errors.Wrap(e.rmRule(*rule), "remove 'allow' exception")
}
return err
}
func (e *Emulator) Apply(rule devices.Rule) error {
if !rule.Type.CanCgroup() {
return errors.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type)
}
innerRule := &deviceRule{
meta: deviceMeta{
node: rule.Type,
major: rule.Major,
minor: rule.Minor,
},
perms: rule.Permissions,
}
if innerRule.meta.node == devices.WildcardDevice {
innerRule = nil
}
if rule.Allow {
return e.allow(innerRule)
}
return e.deny(innerRule)
}
// EmulatorFromList takes a reader to a "devices.list"-like source, and returns
// a new Emulator that represents the state of the devices cgroup. Note that
// black-list devices cgroups cannot be fully reconstructed, due to limitations
// in the devices cgroup API. Instead, such cgroups are always treated as
// "allow all" cgroups.
func EmulatorFromList(list io.Reader) (*Emulator, error) {
// Normally cgroups are in black-list mode by default, but the way we
// figure out the current mode is whether or not devices.list has an
// allow-all rule. So we default to a white-list, and the existence of an
// "a *:* rwm" entry will tell us otherwise.
e := &Emulator{
defaultAllow: false,
}
// Parse the "devices.list".
s := bufio.NewScanner(list)
for s.Scan() {
line := s.Text()
deviceRule, err := parseLine(line)
if err != nil {
return nil, errors.Wrapf(err, "parsing line %q", line)
}
// "devices.list" is an allow list. Note that this means that in
// black-list mode, we have no idea what rules are in play. As a
// result, we need to be very careful in Transition().
if err := e.allow(deviceRule); err != nil {
return nil, errors.Wrapf(err, "adding devices.list rule")
}
}
if err := s.Err(); err != nil {
return nil, errors.Wrap(err, "reading devices.list lines")
}
return e, nil
}
// Transition calculates what is the minimally-disruptive set of rules need to
// be applied to a devices cgroup in order to transition to the given target.
// This means that any already-existing rules will not be applied, and
// disruptive rules (like denying all device access) will only be applied if
// necessary.
//
// This function is the sole reason for all of Emulator -- to allow us
// to figure out how to update a containers' cgroups without causing spurrious
// device errors (if possible).
func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) {
var transitionRules []*devices.Rule
oldRules := source.rules
// If the default policy doesn't match, we need to include a "disruptive"
// rule (either allow-all or deny-all) in order to switch the cgroup to the
// correct default policy.
//
// However, due to a limitation in "devices.list" we cannot be sure what
// deny rules are in place in a black-list cgroup. Thus if the source is a
// black-list we also have to include a disruptive rule.
if source.IsBlacklist() || source.defaultAllow != target.defaultAllow {
transitionRules = append(transitionRules, &devices.Rule{
Type: 'a',
Major: -1,
Minor: -1,
Permissions: devices.Permissions("rwm"),
Allow: target.defaultAllow,
})
// The old rules are only relevant if we aren't starting out with a
// disruptive rule.
oldRules = nil
}
// NOTE: We traverse through the rules in a sorted order so we always write
// the same set of rules (this is to aid testing).
// First, we create inverse rules for any old rules not in the new set.
// This includes partial-inverse rules for specific permissions. This is a
// no-op if we added a disruptive rule, since oldRules will be empty.
for _, rule := range oldRules.orderedEntries() {
meta, oldPerms := rule.meta, rule.perms
newPerms := target.rules[meta]
droppedPerms := oldPerms.Difference(newPerms)
if !droppedPerms.IsEmpty() {
transitionRules = append(transitionRules, &devices.Rule{
Type: meta.node,
Major: meta.major,
Minor: meta.minor,
Permissions: droppedPerms,
Allow: target.defaultAllow,
})
}
}
// Add any additional rules which weren't in the old set. We happen to
// filter out rules which are present in both sets, though this isn't
// strictly necessary.
for _, rule := range target.rules.orderedEntries() {
meta, newPerms := rule.meta, rule.perms
oldPerms := oldRules[meta]
gainedPerms := newPerms.Difference(oldPerms)
if !gainedPerms.IsEmpty() {
transitionRules = append(transitionRules, &devices.Rule{
Type: meta.node,
Major: meta.major,
Minor: meta.minor,
Permissions: gainedPerms,
Allow: !target.defaultAllow,
})
}
}
return transitionRules, nil
}
// Rules returns the minimum set of rules necessary to convert a *deny-all*
// cgroup to the emulated filter state (note that this is not the same as a
// default cgroupv1 cgroup -- which is allow-all). This is effectively just a
// wrapper around Transition() with the source emulator being an empty cgroup.
func (e *Emulator) Rules() ([]*devices.Rule, error) {
defaultCgroup := &Emulator{defaultAllow: false}
return defaultCgroup.Transition(e)
}

View File

@ -0,0 +1,208 @@
// Package devicefilter contains eBPF device filter program
//
// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
//
// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano)
// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397
package devicefilter
import (
"math"
"strconv"
"github.com/cilium/ebpf/asm"
devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
const (
// license string format is same as kernel MODULE_LICENSE macro
license = "Apache"
)
// DeviceFilter returns eBPF device filter program and its license string
func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
// Generate the minimum ruleset for the device rules we are given. While we
// don't care about minimum transitions in cgroupv2, using the emulator
// gives us a guarantee that the behaviour of devices filtering is the same
// as cgroupv1, including security hardenings to avoid misconfiguration
// (such as punching holes in wildcard rules).
emu := new(devicesemulator.Emulator)
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, "", err
}
}
cleanRules, err := emu.Rules()
if err != nil {
return nil, "", err
}
p := &program{
defaultAllow: emu.IsBlacklist(),
}
p.init()
for idx, rule := range cleanRules {
if rule.Type == devices.WildcardDevice {
// We can safely skip over wildcard entries because there should
// only be one (at most) at the very start to instruct cgroupv1 to
// go into allow-list mode. However we do double-check this here.
if idx != 0 || rule.Allow != emu.IsBlacklist() {
return nil, "", errors.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString())
}
continue
}
if rule.Allow == p.defaultAllow {
// There should be no rules which have an action equal to the
// default action, the emulator removes those.
return nil, "", errors.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString())
}
if err := p.appendRule(rule); err != nil {
return nil, "", err
}
}
insts, err := p.finalize()
return insts, license, err
}
type program struct {
insts asm.Instructions
defaultAllow bool
blockID int
}
func (p *program) init() {
// struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423
/*
u32 access_type
u32 major
u32 minor
*/
// R2 <- type (lower 16 bit of u32 access_type at R1[0])
p.insts = append(p.insts,
asm.LoadMem(asm.R2, asm.R1, 0, asm.Word),
asm.And.Imm32(asm.R2, 0xFFFF))
// R3 <- access (upper 16 bit of u32 access_type at R1[0])
p.insts = append(p.insts,
asm.LoadMem(asm.R3, asm.R1, 0, asm.Word),
// RSh: bitwise shift right
asm.RSh.Imm32(asm.R3, 16))
// R4 <- major (u32 major at R1[4])
p.insts = append(p.insts,
asm.LoadMem(asm.R4, asm.R1, 4, asm.Word))
// R5 <- minor (u32 minor at R1[8])
p.insts = append(p.insts,
asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
}
// appendRule rule converts an OCI rule to the relevant eBPF block and adds it
// to the in-progress filter program. In order to operate properly, it must be
// called with a "clean" rule list (generated by devices.Emulator.Rules() --
// with any "a" rules removed).
func (p *program) appendRule(rule *devices.Rule) error {
if p.blockID < 0 {
return errors.New("the program is finalized")
}
var bpfType int32
switch rule.Type {
case devices.CharDevice:
bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
case devices.BlockDevice:
bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
default:
// We do not permit 'a', nor any other types we don't know about.
return errors.Errorf("invalid type %q", string(rule.Type))
}
if rule.Major > math.MaxUint32 {
return errors.Errorf("invalid major %d", rule.Major)
}
if rule.Minor > math.MaxUint32 {
return errors.Errorf("invalid minor %d", rule.Major)
}
hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1
hasMinor := rule.Minor >= 0
bpfAccess := int32(0)
for _, r := range rule.Permissions {
switch r {
case 'r':
bpfAccess |= unix.BPF_DEVCG_ACC_READ
case 'w':
bpfAccess |= unix.BPF_DEVCG_ACC_WRITE
case 'm':
bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
default:
return errors.Errorf("unknown device access %v", r)
}
}
// If the access is rwm, skip the check.
hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
var (
blockSym = "block-" + strconv.Itoa(p.blockID)
nextBlockSym = "block-" + strconv.Itoa(p.blockID+1)
prevBlockLastIdx = len(p.insts) - 1
)
p.insts = append(p.insts,
// if (R2 != bpfType) goto next
asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
)
if hasAccess {
p.insts = append(p.insts,
// if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next
asm.Mov.Reg32(asm.R1, asm.R3),
asm.And.Imm32(asm.R1, bpfAccess),
asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym),
)
}
if hasMajor {
p.insts = append(p.insts,
// if (R4 != major) goto next
asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym),
)
}
if hasMinor {
p.insts = append(p.insts,
// if (R5 != minor) goto next
asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym),
)
}
p.insts = append(p.insts, acceptBlock(rule.Allow)...)
// set blockSym to the first instruction we added in this iteration
p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
p.blockID++
return nil
}
func (p *program) finalize() (asm.Instructions, error) {
var v int32
if p.defaultAllow {
v = 1
}
blockSym := "block-" + strconv.Itoa(p.blockID)
p.insts = append(p.insts,
// R0 <- v
asm.Mov.Imm32(asm.R0, v).Sym(blockSym),
asm.Return(),
)
p.blockID = -1
return p.insts, nil
}
func acceptBlock(accept bool) asm.Instructions {
var v int32
if accept {
v = 1
}
return []asm.Instruction{
// R0 <- v
asm.Mov.Imm32(asm.R0, v),
asm.Return(),
}
}

View File

@ -0,0 +1,253 @@
package ebpf
import (
"fmt"
"os"
"runtime"
"sync"
"unsafe"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/link"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func nilCloser() error {
return nil
}
func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) {
type bpfAttrQuery struct {
TargetFd uint32
AttachType uint32
QueryType uint32
AttachFlags uint32
ProgIds uint64 // __aligned_u64
ProgCnt uint32
}
// Currently you can only have 64 eBPF programs attached to a cgroup.
size := 64
retries := 0
for retries < 10 {
progIds := make([]uint32, size)
query := bpfAttrQuery{
TargetFd: uint32(dirFd),
AttachType: uint32(unix.BPF_CGROUP_DEVICE),
ProgIds: uint64(uintptr(unsafe.Pointer(&progIds[0]))),
ProgCnt: uint32(len(progIds)),
}
// Fetch the list of program ids.
_, _, errno := unix.Syscall(unix.SYS_BPF,
uintptr(unix.BPF_PROG_QUERY),
uintptr(unsafe.Pointer(&query)),
unsafe.Sizeof(query))
size = int(query.ProgCnt)
runtime.KeepAlive(query)
if errno != 0 {
// On ENOSPC we get the correct number of programs.
if errno == unix.ENOSPC {
retries++
continue
}
return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
}
// Convert the ids to program handles.
progIds = progIds[:size]
programs := make([]*ebpf.Program, 0, len(progIds))
for _, progId := range progIds {
program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
if err != nil {
// We skip over programs that give us -EACCES or -EPERM. This
// is necessary because there may be BPF programs that have
// been attached (such as with --systemd-cgroup) which have an
// LSM label that blocks us from interacting with the program.
//
// Because additional BPF_CGROUP_DEVICE programs only can add
// restrictions, there's no real issue with just ignoring these
// programs (and stops runc from breaking on distributions with
// very strict SELinux policies).
if errors.Is(err, os.ErrPermission) {
logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err)
continue
}
return nil, fmt.Errorf("cannot fetch program from id: %w", err)
}
programs = append(programs, program)
}
runtime.KeepAlive(progIds)
return programs, nil
}
return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
}
var (
haveBpfProgReplaceBool bool
haveBpfProgReplaceOnce sync.Once
)
// Loosely based on the BPF_F_REPLACE support check in
// <https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go>.
//
// TODO: move this logic to cilium/ebpf
func haveBpfProgReplace() bool {
haveBpfProgReplaceOnce.Do(func() {
prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
License: "MIT",
Instructions: asm.Instructions{
asm.Mov.Imm(asm.R0, 0),
asm.Return(),
},
})
if err != nil {
logrus.Debugf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err)
return
}
defer prog.Close()
devnull, err := os.Open("/dev/null")
if err != nil {
logrus.Debugf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err)
return
}
defer devnull.Close()
// We know that we have BPF_PROG_ATTACH since we can load
// BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL
// we know that the feature isn't present.
err = link.RawAttachProgram(link.RawAttachProgramOptions{
// We rely on this fd being checked after attachFlags.
Target: int(devnull.Fd()),
// Attempt to "replace" bad fds with this program.
Program: prog,
Attach: ebpf.AttachCGroupDevice,
Flags: unix.BPF_F_ALLOW_MULTI | unix.BPF_F_REPLACE,
})
if errors.Is(err, unix.EINVAL) {
// not supported
return
}
// attach_flags test succeded.
if !errors.Is(err, unix.EBADF) {
logrus.Debugf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err)
}
haveBpfProgReplaceBool = true
})
return haveBpfProgReplaceBool
}
// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
//
// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
//
// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) {
// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
// This limit is not inherited into the container.
memlockLimit := &unix.Rlimit{
Cur: unix.RLIM_INFINITY,
Max: unix.RLIM_INFINITY,
}
_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
// Get the list of existing programs.
oldProgs, err := findAttachedCgroupDeviceFilters(dirFd)
if err != nil {
return nilCloser, err
}
useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1
// Generate new program.
spec := &ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
Instructions: insts,
License: license,
}
prog, err := ebpf.NewProgram(spec)
if err != nil {
return nilCloser, err
}
// If there is only one old program, we can just replace it directly.
var (
replaceProg *ebpf.Program
attachFlags uint32 = unix.BPF_F_ALLOW_MULTI
)
if useReplaceProg {
replaceProg = oldProgs[0]
attachFlags |= unix.BPF_F_REPLACE
}
err = link.RawAttachProgram(link.RawAttachProgramOptions{
Target: dirFd,
Program: prog,
Replace: replaceProg,
Attach: ebpf.AttachCGroupDevice,
Flags: attachFlags,
})
if err != nil {
return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
}
closer := func() error {
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err)
}
// TODO: Should we attach the old filters back in this case? Otherwise
// we fail-open on a security feature, which is a bit scary.
return nil
}
if !useReplaceProg {
logLevel := logrus.DebugLevel
// If there was more than one old program, give a warning (since this
// really shouldn't happen with runc-managed cgroups) and then detach
// all the old programs.
if len(oldProgs) > 1 {
// NOTE: Ideally this should be a warning but it turns out that
// systemd-managed cgroups trigger this warning (apparently
// systemd doesn't delete old non-systemd programs when
// setting properties).
logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs))
logLevel = logrus.InfoLevel
}
for idx, oldProg := range oldProgs {
// Output some extra debug info.
if info, err := oldProg.Info(); err == nil {
fields := logrus.Fields{
"type": info.Type.String(),
"tag": info.Tag,
"name": info.Name,
}
if id, ok := info.ID(); ok {
fields["id"] = id
}
if runCount, ok := info.RunCount(); ok {
fields["run_count"] = runCount
}
if runtime, ok := info.Runtime(); ok {
fields["runtime"] = runtime.String()
}
logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx)
}
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: oldProg,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err)
}
}
}
return closer, nil
}

View File

@ -0,0 +1,166 @@
package cgroups
import (
"bytes"
"os"
"strings"
"sync"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
// OpenFile opens a cgroup file in a given dir with given flags.
// It is supposed to be used for cgroup files only.
func OpenFile(dir, file string, flags int) (*os.File, error) {
if dir == "" {
return nil, errors.Errorf("no directory specified for %s", file)
}
return openFile(dir, file, flags)
}
// ReadFile reads data from a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func ReadFile(dir, file string) (string, error) {
fd, err := OpenFile(dir, file, unix.O_RDONLY)
if err != nil {
return "", err
}
defer fd.Close()
var buf bytes.Buffer
_, err = buf.ReadFrom(fd)
return buf.String(), err
}
// WriteFile writes data to a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func WriteFile(dir, file, data string) error {
fd, err := OpenFile(dir, file, unix.O_WRONLY)
if err != nil {
return err
}
defer fd.Close()
if err := retryingWriteFile(fd, data); err != nil {
return errors.Wrapf(err, "failed to write %q", data)
}
return nil
}
func retryingWriteFile(fd *os.File, data string) error {
for {
_, err := fd.Write([]byte(data))
if errors.Is(err, unix.EINTR) {
logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
continue
}
return err
}
}
const (
cgroupfsDir = "/sys/fs/cgroup"
cgroupfsPrefix = cgroupfsDir + "/"
)
var (
// TestMode is set to true by unit tests that need "fake" cgroupfs.
TestMode bool
cgroupFd int = -1
prepOnce sync.Once
prepErr error
resolveFlags uint64
)
func prepareOpenat2() error {
prepOnce.Do(func() {
fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
Flags: unix.O_DIRECTORY | unix.O_PATH,
})
if err != nil {
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
if err != unix.ENOSYS {
logrus.Warnf("falling back to securejoin: %s", prepErr)
} else {
logrus.Debug("openat2 not available, falling back to securejoin")
}
return
}
var st unix.Statfs_t
if err = unix.Fstatfs(fd, &st); err != nil {
prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
logrus.Warnf("falling back to securejoin: %s", prepErr)
return
}
cgroupFd = fd
resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
if st.Type == unix.CGROUP2_SUPER_MAGIC {
// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS
}
})
return prepErr
}
// OpenFile opens a cgroup file in a given dir with given flags.
// It is supposed to be used for cgroup files only.
func openFile(dir, file string, flags int) (*os.File, error) {
mode := os.FileMode(0)
if TestMode && flags&os.O_WRONLY != 0 {
// "emulate" cgroup fs for unit tests
flags |= os.O_TRUNC | os.O_CREATE
mode = 0o600
}
if prepareOpenat2() != nil {
return openFallback(dir, file, flags, mode)
}
reldir := strings.TrimPrefix(dir, cgroupfsPrefix)
if len(reldir) == len(dir) { // non-standard path, old system?
return openFallback(dir, file, flags, mode)
}
relname := reldir + "/" + file
fd, err := unix.Openat2(cgroupFd, relname,
&unix.OpenHow{
Resolve: resolveFlags,
Flags: uint64(flags) | unix.O_CLOEXEC,
Mode: uint64(mode),
})
if err != nil {
return nil, &os.PathError{Op: "openat2", Path: dir + "/" + file, Err: err}
}
return os.NewFile(uintptr(fd), cgroupfsPrefix+relname), nil
}
var errNotCgroupfs = errors.New("not a cgroup file")
// openFallback is used when openat2(2) is not available. It checks the opened
// file is on cgroupfs, returning an error otherwise.
func openFallback(dir, file string, flags int, mode os.FileMode) (*os.File, error) {
path := dir + "/" + file
fd, err := os.OpenFile(path, flags, mode)
if err != nil {
return nil, err
}
if TestMode {
return fd, nil
}
// Check this is a cgroupfs file.
var st unix.Statfs_t
if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil {
_ = fd.Close()
return nil, &os.PathError{Op: "statfs", Path: path, Err: err}
}
if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC {
_ = fd.Close()
return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs}
}
return fd, nil
}

View File

@ -0,0 +1,311 @@
// +build linux
package fs
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type BlkioGroup struct {
weightFilename string
weightDeviceFilename string
}
func (s *BlkioGroup) Name() string {
return "blkio"
}
func (s *BlkioGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *BlkioGroup) Set(path string, r *configs.Resources) error {
s.detectWeightFilenames(path)
if r.BlkioWeight != 0 {
if err := cgroups.WriteFile(path, s.weightFilename, strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil {
return err
}
}
if r.BlkioLeafWeight != 0 {
if err := cgroups.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil {
return err
}
}
for _, wd := range r.BlkioWeightDevice {
if wd.Weight != 0 {
if err := cgroups.WriteFile(path, s.weightDeviceFilename, wd.WeightString()); err != nil {
return err
}
}
if wd.LeafWeight != 0 {
if err := cgroups.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
return err
}
}
}
for _, td := range r.BlkioThrottleReadBpsDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteBpsDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleReadIOPSDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteIOPSDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
return err
}
}
return nil
}
/*
examples:
blkio.sectors
8:0 6792
blkio.io_service_bytes
8:0 Read 1282048
8:0 Write 2195456
8:0 Sync 2195456
8:0 Async 1282048
8:0 Total 3477504
Total 3477504
blkio.io_serviced
8:0 Read 124
8:0 Write 104
8:0 Sync 104
8:0 Async 124
8:0 Total 228
Total 228
blkio.io_queued
8:0 Read 0
8:0 Write 0
8:0 Sync 0
8:0 Async 0
8:0 Total 0
Total 0
*/
func splitBlkioStatLine(r rune) bool {
return r == ' ' || r == ':'
}
func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) {
var blkioStats []cgroups.BlkioStatEntry
f, err := cgroups.OpenFile(dir, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return blkioStats, nil
}
return nil, err
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
// format: dev type amount
fields := strings.FieldsFunc(sc.Text(), splitBlkioStatLine)
if len(fields) < 3 {
if len(fields) == 2 && fields[0] == "Total" {
// skip total line
continue
} else {
return nil, fmt.Errorf("Invalid line found while parsing %s/%s: %s", dir, file, sc.Text())
}
}
v, err := strconv.ParseUint(fields[0], 10, 64)
if err != nil {
return nil, err
}
major := v
v, err = strconv.ParseUint(fields[1], 10, 64)
if err != nil {
return nil, err
}
minor := v
op := ""
valueField := 2
if len(fields) == 4 {
op = fields[2]
valueField = 3
}
v, err = strconv.ParseUint(fields[valueField], 10, 64)
if err != nil {
return nil, err
}
blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v})
}
return blkioStats, nil
}
func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
type blkioStatInfo struct {
filename string
blkioStatEntriesPtr *[]cgroups.BlkioStatEntry
}
bfqDebugStats := []blkioStatInfo{
{
filename: "blkio.bfq.sectors_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
},
{
filename: "blkio.bfq.io_service_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive,
},
{
filename: "blkio.bfq.io_wait_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive,
},
{
filename: "blkio.bfq.io_merged_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive,
},
{
filename: "blkio.bfq.io_queued_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive,
},
{
filename: "blkio.bfq.time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive,
},
{
filename: "blkio.bfq.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.bfq.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
bfqStats := []blkioStatInfo{
{
filename: "blkio.bfq.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.bfq.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
cfqStats := []blkioStatInfo{
{
filename: "blkio.sectors_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
},
{
filename: "blkio.io_service_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive,
},
{
filename: "blkio.io_wait_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive,
},
{
filename: "blkio.io_merged_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive,
},
{
filename: "blkio.io_queued_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive,
},
{
filename: "blkio.time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive,
},
{
filename: "blkio.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
throttleRecursiveStats := []blkioStatInfo{
{
filename: "blkio.throttle.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.throttle.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
baseStats := []blkioStatInfo{
{
filename: "blkio.throttle.io_serviced",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.throttle.io_service_bytes",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
orderedStats := [][]blkioStatInfo{
bfqDebugStats,
bfqStats,
cfqStats,
throttleRecursiveStats,
baseStats,
}
var blkioStats []cgroups.BlkioStatEntry
var err error
for _, statGroup := range orderedStats {
for i, statInfo := range statGroup {
if blkioStats, err = getBlkioStat(path, statInfo.filename); err != nil || blkioStats == nil {
// if error occurs on first file, move to next group
if i == 0 {
break
}
return err
}
*statInfo.blkioStatEntriesPtr = blkioStats
// finish if all stats are gathered
if i == len(statGroup)-1 {
return nil
}
}
}
return nil
}
func (s *BlkioGroup) detectWeightFilenames(path string) {
if s.weightFilename != "" {
// Already detected.
return
}
if cgroups.PathExists(filepath.Join(path, "blkio.weight")) {
s.weightFilename = "blkio.weight"
s.weightDeviceFilename = "blkio.weight_device"
} else {
s.weightFilename = "blkio.bfq.weight"
s.weightDeviceFilename = "blkio.bfq.weight_device"
}
}

View File

@ -0,0 +1,115 @@
// +build linux
package fs
import (
"bufio"
"fmt"
"os"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type CpuGroup struct{}
func (s *CpuGroup) Name() string {
return "cpu"
}
func (s *CpuGroup) Apply(path string, d *cgroupData) error {
// This might happen if we have no cpu cgroup mounted.
// Just do nothing and don't fail.
if path == "" {
return nil
}
if err := os.MkdirAll(path, 0o755); err != nil {
return err
}
// We should set the real-Time group scheduling settings before moving
// in the process because if the process is already in SCHED_RR mode
// and no RT bandwidth is set, adding it will fail.
if err := s.SetRtSched(path, d.config.Resources); err != nil {
return err
}
// Since we are not using join(), we need to place the pid
// into the procs file unlike other subsystems.
return cgroups.WriteCgroupProc(path, d.pid)
}
func (s *CpuGroup) SetRtSched(path string, r *configs.Resources) error {
if r.CpuRtPeriod != 0 {
if err := cgroups.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(r.CpuRtPeriod, 10)); err != nil {
return err
}
}
if r.CpuRtRuntime != 0 {
if err := cgroups.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil {
return err
}
}
return nil
}
func (s *CpuGroup) Set(path string, r *configs.Resources) error {
if r.CpuShares != 0 {
shares := r.CpuShares
if err := cgroups.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil {
return err
}
// read it back
sharesRead, err := fscommon.GetCgroupParamUint(path, "cpu.shares")
if err != nil {
return err
}
// ... and check
if shares > sharesRead {
return fmt.Errorf("the maximum allowed cpu-shares is %d", sharesRead)
} else if shares < sharesRead {
return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead)
}
}
if r.CpuPeriod != 0 {
if err := cgroups.WriteFile(path, "cpu.cfs_period_us", strconv.FormatUint(r.CpuPeriod, 10)); err != nil {
return err
}
}
if r.CpuQuota != 0 {
if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil {
return err
}
}
return s.SetRtSched(path, r)
}
func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
f, err := cgroups.OpenFile(path, "cpu.stat", os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return err
}
switch t {
case "nr_periods":
stats.CpuStats.ThrottlingData.Periods = v
case "nr_throttled":
stats.CpuStats.ThrottlingData.ThrottledPeriods = v
case "throttled_time":
stats.CpuStats.ThrottlingData.ThrottledTime = v
}
}
return nil
}

View File

@ -0,0 +1,172 @@
// +build linux
package fs
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
const (
cgroupCpuacctStat = "cpuacct.stat"
cgroupCpuacctUsageAll = "cpuacct.usage_all"
nanosecondsInSecond = 1000000000
userModeColumn = 1
kernelModeColumn = 2
cuacctUsageAllColumnsNumber = 3
// The value comes from `C.sysconf(C._SC_CLK_TCK)`, and
// on Linux it's a constant which is safe to be hard coded,
// so we can avoid using cgo here. For details, see:
// https://github.com/containerd/cgroups/pull/12
clockTicks uint64 = 100
)
type CpuacctGroup struct{}
func (s *CpuacctGroup) Name() string {
return "cpuacct"
}
func (s *CpuacctGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *CpuacctGroup) Set(_ string, _ *configs.Resources) error {
return nil
}
func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error {
if !cgroups.PathExists(path) {
return nil
}
userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path)
if err != nil {
return err
}
totalUsage, err := fscommon.GetCgroupParamUint(path, "cpuacct.usage")
if err != nil {
return err
}
percpuUsage, err := getPercpuUsage(path)
if err != nil {
return err
}
percpuUsageInKernelmode, percpuUsageInUsermode, err := getPercpuUsageInModes(path)
if err != nil {
return err
}
stats.CpuStats.CpuUsage.TotalUsage = totalUsage
stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage
stats.CpuStats.CpuUsage.PercpuUsageInKernelmode = percpuUsageInKernelmode
stats.CpuStats.CpuUsage.PercpuUsageInUsermode = percpuUsageInUsermode
stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage
stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage
return nil
}
// Returns user and kernel usage breakdown in nanoseconds.
func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
var userModeUsage, kernelModeUsage uint64
const (
userField = "user"
systemField = "system"
)
// Expected format:
// user <usage in ticks>
// system <usage in ticks>
data, err := cgroups.ReadFile(path, cgroupCpuacctStat)
if err != nil {
return 0, 0, err
}
fields := strings.Fields(data)
if len(fields) < 4 {
return 0, 0, fmt.Errorf("failure - %s is expected to have at least 4 fields", filepath.Join(path, cgroupCpuacctStat))
}
if fields[0] != userField {
return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[0], cgroupCpuacctStat, userField)
}
if fields[2] != systemField {
return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[2], cgroupCpuacctStat, systemField)
}
if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil {
return 0, 0, err
}
if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil {
return 0, 0, err
}
return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil
}
func getPercpuUsage(path string) ([]uint64, error) {
percpuUsage := []uint64{}
data, err := cgroups.ReadFile(path, "cpuacct.usage_percpu")
if err != nil {
return percpuUsage, err
}
for _, value := range strings.Fields(data) {
value, err := strconv.ParseUint(value, 10, 64)
if err != nil {
return percpuUsage, fmt.Errorf("Unable to convert param value to uint64: %s", err)
}
percpuUsage = append(percpuUsage, value)
}
return percpuUsage, nil
}
func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) {
usageKernelMode := []uint64{}
usageUserMode := []uint64{}
file, err := cgroups.OpenFile(path, cgroupCpuacctUsageAll, os.O_RDONLY)
if os.IsNotExist(err) {
return usageKernelMode, usageUserMode, nil
} else if err != nil {
return nil, nil, err
}
defer file.Close()
scanner := bufio.NewScanner(file)
scanner.Scan() // skipping header line
for scanner.Scan() {
lineFields := strings.SplitN(scanner.Text(), " ", cuacctUsageAllColumnsNumber+1)
if len(lineFields) != cuacctUsageAllColumnsNumber {
continue
}
usageInKernelMode, err := strconv.ParseUint(lineFields[kernelModeColumn], 10, 64)
if err != nil {
return nil, nil, fmt.Errorf("Unable to convert CPU usage in kernel mode to uint64: %s", err)
}
usageKernelMode = append(usageKernelMode, usageInKernelMode)
usageInUserMode, err := strconv.ParseUint(lineFields[userModeColumn], 10, 64)
if err != nil {
return nil, nil, fmt.Errorf("Unable to convert CPU usage in user mode to uint64: %s", err)
}
usageUserMode = append(usageUserMode, usageInUserMode)
}
if err := scanner.Err(); err != nil {
return nil, nil, fmt.Errorf("Problem in reading %s line by line, %s", cgroupCpuacctUsageAll, err)
}
return usageKernelMode, usageUserMode, nil
}

View File

@ -0,0 +1,248 @@
// +build linux
package fs
import (
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
type CpusetGroup struct{}
func (s *CpusetGroup) Name() string {
return "cpuset"
}
func (s *CpusetGroup) Apply(path string, d *cgroupData) error {
return s.ApplyDir(path, d.config.Resources, d.pid)
}
func (s *CpusetGroup) Set(path string, r *configs.Resources) error {
if r.CpusetCpus != "" {
if err := cgroups.WriteFile(path, "cpuset.cpus", r.CpusetCpus); err != nil {
return err
}
}
if r.CpusetMems != "" {
if err := cgroups.WriteFile(path, "cpuset.mems", r.CpusetMems); err != nil {
return err
}
}
return nil
}
func getCpusetStat(path string, filename string) ([]uint16, error) {
var extracted []uint16
fileContent, err := fscommon.GetCgroupParamString(path, filename)
if err != nil {
return extracted, err
}
if len(fileContent) == 0 {
return extracted, fmt.Errorf("%s found to be empty", filepath.Join(path, filename))
}
for _, s := range strings.Split(fileContent, ",") {
splitted := strings.SplitN(s, "-", 3)
switch len(splitted) {
case 3:
return extracted, fmt.Errorf("invalid values in %s", filepath.Join(path, filename))
case 2:
min, err := strconv.ParseUint(splitted[0], 10, 16)
if err != nil {
return extracted, err
}
max, err := strconv.ParseUint(splitted[1], 10, 16)
if err != nil {
return extracted, err
}
if min > max {
return extracted, fmt.Errorf("invalid values in %s", filepath.Join(path, filename))
}
for i := min; i <= max; i++ {
extracted = append(extracted, uint16(i))
}
case 1:
value, err := strconv.ParseUint(s, 10, 16)
if err != nil {
return extracted, err
}
extracted = append(extracted, uint16(value))
}
}
return extracted, nil
}
func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error {
var err error
stats.CPUSetStats.CPUs, err = getCpusetStat(path, "cpuset.cpus")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.CPUExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.cpu_exclusive")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.Mems, err = getCpusetStat(path, "cpuset.mems")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemHardwall, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_hardwall")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_exclusive")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemoryMigrate, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_migrate")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemorySpreadPage, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_page")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemorySpreadSlab, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_slab")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemoryPressure, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_pressure")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.SchedLoadBalance, err = fscommon.GetCgroupParamUint(path, "cpuset.sched_load_balance")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.SchedRelaxDomainLevel, err = fscommon.GetCgroupParamInt(path, "cpuset.sched_relax_domain_level")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
return nil
}
func (s *CpusetGroup) ApplyDir(dir string, r *configs.Resources, pid int) error {
// This might happen if we have no cpuset cgroup mounted.
// Just do nothing and don't fail.
if dir == "" {
return nil
}
// 'ensureParent' start with parent because we don't want to
// explicitly inherit from parent, it could conflict with
// 'cpuset.cpu_exclusive'.
if err := cpusetEnsureParent(filepath.Dir(dir)); err != nil {
return err
}
if err := os.Mkdir(dir, 0o755); err != nil && !os.IsExist(err) {
return err
}
// We didn't inherit cpuset configs from parent, but we have
// to ensure cpuset configs are set before moving task into the
// cgroup.
// The logic is, if user specified cpuset configs, use these
// specified configs, otherwise, inherit from parent. This makes
// cpuset configs work correctly with 'cpuset.cpu_exclusive', and
// keep backward compatibility.
if err := s.ensureCpusAndMems(dir, r); err != nil {
return err
}
// because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems
return cgroups.WriteCgroupProc(dir, pid)
}
func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) {
if cpus, err = cgroups.ReadFile(parent, "cpuset.cpus"); err != nil {
return
}
if mems, err = cgroups.ReadFile(parent, "cpuset.mems"); err != nil {
return
}
return cpus, mems, nil
}
// cpusetEnsureParent makes sure that the parent directories of current
// are created and populated with the proper cpus and mems files copied
// from their respective parent. It does that recursively, starting from
// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point).
func cpusetEnsureParent(current string) error {
var st unix.Statfs_t
parent := filepath.Dir(current)
err := unix.Statfs(parent, &st)
if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC {
return nil
}
// Treat non-existing directory as cgroupfs as it will be created,
// and the root cpuset directory obviously exists.
if err != nil && err != unix.ENOENT {
return &os.PathError{Op: "statfs", Path: parent, Err: err}
}
if err := cpusetEnsureParent(parent); err != nil {
return err
}
if err := os.Mkdir(current, 0o755); err != nil && !os.IsExist(err) {
return err
}
return cpusetCopyIfNeeded(current, parent)
}
// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent
// directory to the current directory if the file's contents are 0
func cpusetCopyIfNeeded(current, parent string) error {
currentCpus, currentMems, err := getCpusetSubsystemSettings(current)
if err != nil {
return err
}
parentCpus, parentMems, err := getCpusetSubsystemSettings(parent)
if err != nil {
return err
}
if isEmptyCpuset(currentCpus) {
if err := cgroups.WriteFile(current, "cpuset.cpus", string(parentCpus)); err != nil {
return err
}
}
if isEmptyCpuset(currentMems) {
if err := cgroups.WriteFile(current, "cpuset.mems", string(parentMems)); err != nil {
return err
}
}
return nil
}
func isEmptyCpuset(str string) bool {
return str == "" || str == "\n"
}
func (s *CpusetGroup) ensureCpusAndMems(path string, r *configs.Resources) error {
if err := s.Set(path, r); err != nil {
return err
}
return cpusetCopyIfNeeded(path, filepath.Dir(path))
}

View File

@ -0,0 +1,110 @@
// +build linux
package fs
import (
"bytes"
"errors"
"reflect"
"github.com/opencontainers/runc/libcontainer/cgroups"
cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
)
type DevicesGroup struct {
testingSkipFinalCheck bool
}
func (s *DevicesGroup) Name() string {
return "devices"
}
func (s *DevicesGroup) Apply(path string, d *cgroupData) error {
if d.config.SkipDevices {
return nil
}
if path == "" {
// Return error here, since devices cgroup
// is a hard requirement for container's security.
return errSubsystemDoesNotExist
}
return join(path, d.pid)
}
func loadEmulator(path string) (*cgroupdevices.Emulator, error) {
list, err := cgroups.ReadFile(path, "devices.list")
if err != nil {
return nil, err
}
return cgroupdevices.EmulatorFromList(bytes.NewBufferString(list))
}
func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) {
// This defaults to a white-list -- which is what we want!
emu := &cgroupdevices.Emulator{}
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, err
}
}
return emu, nil
}
func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
if userns.RunningInUserNS() || r.SkipDevices {
return nil
}
// Generate two emulators, one for the current state of the cgroup and one
// for the requested state by the user.
current, err := loadEmulator(path)
if err != nil {
return err
}
target, err := buildEmulator(r.Devices)
if err != nil {
return err
}
// Compute the minimal set of transition rules needed to achieve the
// requested state.
transitionRules, err := current.Transition(target)
if err != nil {
return err
}
for _, rule := range transitionRules {
file := "devices.deny"
if rule.Allow {
file = "devices.allow"
}
if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil {
return err
}
}
// Final safety check -- ensure that the resulting state is what was
// requested. This is only really correct for white-lists, but for
// black-lists we can at least check that the cgroup is in the right mode.
//
// This safety-check is skipped for the unit tests because we cannot
// currently mock devices.list correctly.
if !s.testingSkipFinalCheck {
currentAfter, err := loadEmulator(path)
if err != nil {
return err
}
if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) {
return errors.New("resulting devices cgroup doesn't precisely match target")
} else if target.IsBlacklist() != currentAfter.IsBlacklist() {
return errors.New("resulting devices cgroup doesn't match target mode")
}
}
return nil
}
func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View File

@ -0,0 +1,160 @@
// +build linux
package fs
import (
"errors"
"fmt"
"os"
"strings"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
type FreezerGroup struct{}
func (s *FreezerGroup) Name() string {
return "freezer"
}
func (s *FreezerGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) {
switch r.Freezer {
case configs.Frozen:
defer func() {
if Err != nil {
// Freezing failed, and it is bad and dangerous
// to leave the cgroup in FROZEN or FREEZING
// state, so (try to) thaw it back.
_ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
}
}()
// As per older kernel docs (freezer-subsystem.txt before
// kernel commit ef9fe980c6fcc1821), if FREEZING is seen,
// userspace should either retry or thaw. While current
// kernel cgroup v1 docs no longer mention a need to retry,
// even a recent kernel (v5.4, Ubuntu 20.04) can't reliably
// freeze a cgroup v1 while new processes keep appearing in it
// (either via fork/clone or by writing new PIDs to
// cgroup.procs).
//
// The numbers below are empirically chosen to have a decent
// chance to succeed in various scenarios ("runc pause/unpause
// with parallel runc exec" and "bare freeze/unfreeze on a very
// slow system"), tested on RHEL7 and Ubuntu 20.04 kernels.
//
// Adding any amount of sleep in between retries did not
// increase the chances of successful freeze in "pause/unpause
// with parallel exec" reproducer. OTOH, adding an occasional
// sleep helped for the case where the system is extremely slow
// (CentOS 7 VM on GHA CI).
//
// Alas, this is still a game of chances, since the real fix
// belong to the kernel (cgroup v2 do not have this bug).
for i := 0; i < 1000; i++ {
if i%50 == 49 {
// Occasional thaw and sleep improves
// the chances to succeed in freezing
// in case new processes keep appearing
// in the cgroup.
_ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
time.Sleep(10 * time.Millisecond)
}
if err := cgroups.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil {
return err
}
if i%25 == 24 {
// Occasional short sleep before reading
// the state back also improves the chances to
// succeed in freezing in case of a very slow
// system.
time.Sleep(10 * time.Microsecond)
}
state, err := cgroups.ReadFile(path, "freezer.state")
if err != nil {
return err
}
state = strings.TrimSpace(state)
switch state {
case "FREEZING":
continue
case string(configs.Frozen):
if i > 1 {
logrus.Debugf("frozen after %d retries", i)
}
return nil
default:
// should never happen
return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state))
}
}
// Despite our best efforts, it got stuck in FREEZING.
return errors.New("unable to freeze")
case configs.Thawed:
return cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
case configs.Undefined:
return nil
default:
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer))
}
}
func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
func (s *FreezerGroup) GetState(path string) (configs.FreezerState, error) {
for {
state, err := cgroups.ReadFile(path, "freezer.state")
if err != nil {
// If the kernel is too old, then we just treat the freezer as
// being in an "undefined" state.
if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) {
err = nil
}
return configs.Undefined, err
}
switch strings.TrimSpace(state) {
case "THAWED":
return configs.Thawed, nil
case "FROZEN":
// Find out whether the cgroup is frozen directly,
// or indirectly via an ancestor.
self, err := cgroups.ReadFile(path, "freezer.self_freezing")
if err != nil {
// If the kernel is too old, then we just treat
// it as being frozen.
if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.ENODEV) {
err = nil
}
return configs.Frozen, err
}
switch self {
case "0\n":
return configs.Thawed, nil
case "1\n":
return configs.Frozen, nil
default:
return configs.Undefined, fmt.Errorf(`unknown "freezer.self_freezing" state: %q`, self)
}
case "FREEZING":
// Make sure we get a stable freezer state, so retry if the cgroup
// is still undergoing freezing. This should be a temporary delay.
time.Sleep(1 * time.Millisecond)
continue
default:
return configs.Undefined, fmt.Errorf("unknown freezer.state %q", state)
}
}
}

View File

@ -0,0 +1,440 @@
// +build linux
package fs
import (
"fmt"
"os"
"path/filepath"
"sync"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
var (
subsystems = []subsystem{
&CpusetGroup{},
&DevicesGroup{},
&MemoryGroup{},
&CpuGroup{},
&CpuacctGroup{},
&PidsGroup{},
&BlkioGroup{},
&HugetlbGroup{},
&NetClsGroup{},
&NetPrioGroup{},
&PerfEventGroup{},
&FreezerGroup{},
&NameGroup{GroupName: "name=systemd", Join: true},
}
HugePageSizes, _ = cgroups.GetHugePageSize()
)
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
type subsystem interface {
// Name returns the name of the subsystem.
Name() string
// Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
GetStats(path string, stats *cgroups.Stats) error
// Creates and joins the cgroup represented by 'cgroupData'.
Apply(path string, c *cgroupData) error
// Set sets the cgroup resources.
Set(path string, r *configs.Resources) error
}
type manager struct {
mu sync.Mutex
cgroups *configs.Cgroup
rootless bool // ignore permission-related errors
paths map[string]string
}
func NewManager(cg *configs.Cgroup, paths map[string]string, rootless bool) cgroups.Manager {
return &manager{
cgroups: cg,
paths: paths,
rootless: rootless,
}
}
// The absolute path to the root of the cgroup hierarchies.
var (
cgroupRootLock sync.Mutex
cgroupRoot string
)
const defaultCgroupRoot = "/sys/fs/cgroup"
func tryDefaultCgroupRoot() string {
var st, pst unix.Stat_t
// (1) it should be a directory...
err := unix.Lstat(defaultCgroupRoot, &st)
if err != nil || st.Mode&unix.S_IFDIR == 0 {
return ""
}
// (2) ... and a mount point ...
err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst)
if err != nil {
return ""
}
if st.Dev == pst.Dev {
// parent dir has the same dev -- not a mount point
return ""
}
// (3) ... of 'tmpfs' fs type.
var fst unix.Statfs_t
err = unix.Statfs(defaultCgroupRoot, &fst)
if err != nil || fst.Type != unix.TMPFS_MAGIC {
return ""
}
// (4) it should have at least 1 entry ...
dir, err := os.Open(defaultCgroupRoot)
if err != nil {
return ""
}
names, err := dir.Readdirnames(1)
if err != nil {
return ""
}
if len(names) < 1 {
return ""
}
// ... which is a cgroup mount point.
err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst)
if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC {
return ""
}
return defaultCgroupRoot
}
// Gets the cgroupRoot.
func getCgroupRoot() (string, error) {
cgroupRootLock.Lock()
defer cgroupRootLock.Unlock()
if cgroupRoot != "" {
return cgroupRoot, nil
}
// fast path
cgroupRoot = tryDefaultCgroupRoot()
if cgroupRoot != "" {
return cgroupRoot, nil
}
// slow path: parse mountinfo
mi, err := cgroups.GetCgroupMounts(false)
if err != nil {
return "", err
}
if len(mi) < 1 {
return "", errors.New("no cgroup mount found in mountinfo")
}
// Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"),
// use its parent directory.
root := filepath.Dir(mi[0].Mountpoint)
if _, err := os.Stat(root); err != nil {
return "", err
}
cgroupRoot = root
return cgroupRoot, nil
}
type cgroupData struct {
root string
innerPath string
config *configs.Cgroup
pid int
}
// isIgnorableError returns whether err is a permission error (in the loose
// sense of the word). This includes EROFS (which for an unprivileged user is
// basically a permission error) and EACCES (for similar reasons) as well as
// the normal EPERM.
func isIgnorableError(rootless bool, err error) bool {
// We do not ignore errors if we are root.
if !rootless {
return false
}
// TODO: rm errors.Cause once we switch to %w everywhere
err = errors.Cause(err)
// Is it an ordinary EPERM?
if errors.Is(err, os.ErrPermission) {
return true
}
// Handle some specific syscall errors.
var errno unix.Errno
if errors.As(err, &errno) {
return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES
}
return false
}
func (m *manager) Apply(pid int) (err error) {
if m.cgroups == nil {
return nil
}
m.mu.Lock()
defer m.mu.Unlock()
c := m.cgroups
if c.Resources.Unified != nil {
return cgroups.ErrV1NoUnified
}
m.paths = make(map[string]string)
if c.Paths != nil {
cgMap, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return err
}
for name, path := range c.Paths {
// XXX(kolyshkin@): why this check is needed?
if _, ok := cgMap[name]; ok {
m.paths[name] = path
}
}
return cgroups.EnterPid(m.paths, pid)
}
d, err := getCgroupData(m.cgroups, pid)
if err != nil {
return err
}
for _, sys := range subsystems {
p, err := d.path(sys.Name())
if err != nil {
// The non-presence of the devices subsystem is
// considered fatal for security reasons.
if cgroups.IsNotFound(err) && (c.SkipDevices || sys.Name() != "devices") {
continue
}
return err
}
m.paths[sys.Name()] = p
if err := sys.Apply(p, d); err != nil {
// In the case of rootless (including euid=0 in userns), where an
// explicit cgroup path hasn't been set, we don't bail on error in
// case of permission problems. Cases where limits have been set
// (and we couldn't create our own cgroup) are handled by Set.
if isIgnorableError(m.rootless, err) && m.cgroups.Path == "" {
delete(m.paths, sys.Name())
continue
}
return err
}
}
return nil
}
func (m *manager) Destroy() error {
if m.cgroups == nil || m.cgroups.Paths != nil {
return nil
}
m.mu.Lock()
defer m.mu.Unlock()
return cgroups.RemovePaths(m.paths)
}
func (m *manager) Path(subsys string) string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths[subsys]
}
func (m *manager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock()
defer m.mu.Unlock()
stats := cgroups.NewStats()
for _, sys := range subsystems {
path := m.paths[sys.Name()]
if path == "" {
continue
}
if err := sys.GetStats(path, stats); err != nil {
return nil, err
}
}
return stats, nil
}
func (m *manager) Set(r *configs.Resources) error {
if r == nil {
return nil
}
// If Paths are set, then we are just joining cgroups paths
// and there is no need to set any values.
if m.cgroups != nil && m.cgroups.Paths != nil {
return nil
}
if r.Unified != nil {
return cgroups.ErrV1NoUnified
}
m.mu.Lock()
defer m.mu.Unlock()
for _, sys := range subsystems {
path := m.paths[sys.Name()]
if err := sys.Set(path, r); err != nil {
if m.rootless && sys.Name() == "devices" {
continue
}
// When m.rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if path == "" {
// We never created a path for this cgroup, so we cannot set
// limits for it (though we have already tried at this point).
return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name())
}
return err
}
}
return nil
}
// Freeze toggles the container's freezer cgroup depending on the state
// provided
func (m *manager) Freeze(state configs.FreezerState) error {
path := m.Path("freezer")
if m.cgroups == nil || path == "" {
return errors.New("cannot toggle freezer: cgroups not configured for container")
}
prevState := m.cgroups.Resources.Freezer
m.cgroups.Resources.Freezer = state
freezer := &FreezerGroup{}
if err := freezer.Set(path, m.cgroups.Resources); err != nil {
m.cgroups.Resources.Freezer = prevState
return err
}
return nil
}
func (m *manager) GetPids() ([]int, error) {
return cgroups.GetPids(m.Path("devices"))
}
func (m *manager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(m.Path("devices"))
}
func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
root, err := getCgroupRoot()
if err != nil {
return nil, err
}
if (c.Name != "" || c.Parent != "") && c.Path != "" {
return nil, errors.New("cgroup: either Path or Name and Parent should be used")
}
// XXX: Do not remove this code. Path safety is important! -- cyphar
cgPath := libcontainerUtils.CleanPath(c.Path)
cgParent := libcontainerUtils.CleanPath(c.Parent)
cgName := libcontainerUtils.CleanPath(c.Name)
innerPath := cgPath
if innerPath == "" {
innerPath = filepath.Join(cgParent, cgName)
}
return &cgroupData{
root: root,
innerPath: innerPath,
config: c,
pid: pid,
}, nil
}
func (raw *cgroupData) path(subsystem string) (string, error) {
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
if filepath.IsAbs(raw.innerPath) {
mnt, err := cgroups.FindCgroupMountpoint(raw.root, subsystem)
// If we didn't mount the subsystem, there is no point we make the path.
if err != nil {
return "", err
}
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
}
// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
// process could in container and shared pid namespace with host, and
// /proc/1/cgroup could point to whole other world of cgroups.
parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
if err != nil {
return "", err
}
return filepath.Join(parentPath, raw.innerPath), nil
}
func join(path string, pid int) error {
if path == "" {
return nil
}
if err := os.MkdirAll(path, 0o755); err != nil {
return err
}
return cgroups.WriteCgroupProc(path, pid)
}
func (m *manager) GetPaths() map[string]string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths
}
func (m *manager) GetCgroups() (*configs.Cgroup, error) {
return m.cgroups, nil
}
func (m *manager) GetFreezerState() (configs.FreezerState, error) {
dir := m.Path("freezer")
// If the container doesn't have the freezer cgroup, say it's undefined.
if dir == "" {
return configs.Undefined, nil
}
freezer := &FreezerGroup{}
return freezer.GetState(dir)
}
func (m *manager) Exists() bool {
return cgroups.PathExists(m.Path("devices"))
}
func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill")
}
func (m *manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.Path("memory"))
// Ignore ENOENT when rootless as it couldn't create cgroup.
if err != nil && m.rootless && os.IsNotExist(err) {
err = nil
}
return c, err
}

View File

@ -0,0 +1,65 @@
// +build linux
package fs
import (
"fmt"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type HugetlbGroup struct{}
func (s *HugetlbGroup) Name() string {
return "hugetlb"
}
func (s *HugetlbGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *HugetlbGroup) Set(path string, r *configs.Resources) error {
for _, hugetlb := range r.HugetlbLimit {
if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}
}
return nil
}
func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
hugetlbStats := cgroups.HugetlbStats{}
if !cgroups.PathExists(path) {
return nil
}
for _, pageSize := range HugePageSizes {
usage := "hugetlb." + pageSize + ".usage_in_bytes"
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", usage, err)
}
hugetlbStats.Usage = value
maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes"
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", maxUsage, err)
}
hugetlbStats.MaxUsage = value
failcnt := "hugetlb." + pageSize + ".failcnt"
value, err = fscommon.GetCgroupParamUint(path, failcnt)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", failcnt, err)
}
hugetlbStats.Failcnt = value
stats.HugetlbStats[pageSize] = hugetlbStats
}
return nil
}

View File

@ -0,0 +1,352 @@
// +build linux
package fs
import (
"bufio"
"fmt"
"math"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
const (
cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
cgroupMemoryLimit = "memory.limit_in_bytes"
cgroupMemoryUsage = "memory.usage_in_bytes"
cgroupMemoryMaxUsage = "memory.max_usage_in_bytes"
)
type MemoryGroup struct{}
func (s *MemoryGroup) Name() string {
return "memory"
}
func (s *MemoryGroup) Apply(path string, d *cgroupData) (err error) {
return join(path, d.pid)
}
func setMemory(path string, val int64) error {
if val == 0 {
return nil
}
err := cgroups.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10))
if !errors.Is(err, unix.EBUSY) {
return err
}
// EBUSY means the kernel can't set new limit as it's too low
// (lower than the current usage). Return more specific error.
usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage)
if err != nil {
return err
}
max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage)
if err != nil {
return err
}
return errors.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max)
}
func setSwap(path string, val int64) error {
if val == 0 {
return nil
}
return cgroups.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10))
}
func setMemoryAndSwap(path string, r *configs.Resources) error {
// If the memory update is set to -1 and the swap is not explicitly
// set, we should also set swap to -1, it means unlimited memory.
if r.Memory == -1 && r.MemorySwap == 0 {
// Only set swap if it's enabled in kernel
if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) {
r.MemorySwap = -1
}
}
// When memory and swap memory are both set, we need to handle the cases
// for updating container.
if r.Memory != 0 && r.MemorySwap != 0 {
curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit)
if err != nil {
return err
}
// When update memory limit, we should adapt the write sequence
// for memory and swap memory, so it won't fail because the new
// value and the old value don't fit kernel's validation.
if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) {
if err := setSwap(path, r.MemorySwap); err != nil {
return err
}
if err := setMemory(path, r.Memory); err != nil {
return err
}
return nil
}
}
if err := setMemory(path, r.Memory); err != nil {
return err
}
if err := setSwap(path, r.MemorySwap); err != nil {
return err
}
return nil
}
func (s *MemoryGroup) Set(path string, r *configs.Resources) error {
if err := setMemoryAndSwap(path, r); err != nil {
return err
}
// ignore KernelMemory and KernelMemoryTCP
if r.MemoryReservation != 0 {
if err := cgroups.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil {
return err
}
}
if r.OomKillDisable {
if err := cgroups.WriteFile(path, "memory.oom_control", "1"); err != nil {
return err
}
}
if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 {
return nil
} else if *r.MemorySwappiness <= 100 {
if err := cgroups.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil {
return err
}
} else {
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *r.MemorySwappiness)
}
return nil
}
func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
// Set stats from memory.stat.
statsFile, err := cgroups.OpenFile(path, "memory.stat", os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
defer statsFile.Close()
sc := bufio.NewScanner(statsFile)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err)
}
stats.MemoryStats.Stats[t] = v
}
stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"]
memoryUsage, err := getMemoryData(path, "")
if err != nil {
return err
}
stats.MemoryStats.Usage = memoryUsage
swapUsage, err := getMemoryData(path, "memsw")
if err != nil {
return err
}
stats.MemoryStats.SwapUsage = swapUsage
kernelUsage, err := getMemoryData(path, "kmem")
if err != nil {
return err
}
stats.MemoryStats.KernelUsage = kernelUsage
kernelTCPUsage, err := getMemoryData(path, "kmem.tcp")
if err != nil {
return err
}
stats.MemoryStats.KernelTCPUsage = kernelTCPUsage
value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy")
if err != nil {
return err
}
if value == 1 {
stats.MemoryStats.UseHierarchy = true
}
pagesByNUMA, err := getPageUsageByNUMA(path)
if err != nil {
return err
}
stats.MemoryStats.PageUsageByNUMA = pagesByNUMA
return nil
}
func getMemoryData(path, name string) (cgroups.MemoryData, error) {
memoryData := cgroups.MemoryData{}
moduleName := "memory"
if name != "" {
moduleName = "memory." + name
}
var (
usage = moduleName + ".usage_in_bytes"
maxUsage = moduleName + ".max_usage_in_bytes"
failcnt = moduleName + ".failcnt"
limit = moduleName + ".limit_in_bytes"
)
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
if name != "" && os.IsNotExist(err) {
// Ignore ENOENT as swap and kmem controllers
// are optional in the kernel.
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", usage, err)
}
memoryData.Usage = value
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil {
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", maxUsage, err)
}
memoryData.MaxUsage = value
value, err = fscommon.GetCgroupParamUint(path, failcnt)
if err != nil {
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
}
memoryData.Failcnt = value
value, err = fscommon.GetCgroupParamUint(path, limit)
if err != nil {
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
}
memoryData.Limit = value
return memoryData, nil
}
func getPageUsageByNUMA(cgroupPath string) (cgroups.PageUsageByNUMA, error) {
const (
maxColumns = math.MaxUint8 + 1
filename = "memory.numa_stat"
)
stats := cgroups.PageUsageByNUMA{}
file, err := cgroups.OpenFile(cgroupPath, filename, os.O_RDONLY)
if os.IsNotExist(err) {
return stats, nil
} else if err != nil {
return stats, err
}
defer file.Close()
// File format is documented in linux/Documentation/cgroup-v1/memory.txt
// and it looks like this:
//
// total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
// file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
// anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
// unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
// hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
scanner := bufio.NewScanner(file)
for scanner.Scan() {
var field *cgroups.PageStats
line := scanner.Text()
columns := strings.SplitN(line, " ", maxColumns)
for i, column := range columns {
byNode := strings.SplitN(column, "=", 2)
// Some custom kernels have non-standard fields, like
// numa_locality 0 0 0 0 0 0 0 0 0 0
// numa_exectime 0
if len(byNode) < 2 {
if i == 0 {
// Ignore/skip those.
break
} else {
// The first column was already validated,
// so be strict to the rest.
return stats, fmt.Errorf("malformed line %q in %s",
line, filename)
}
}
key, val := byNode[0], byNode[1]
if i == 0 { // First column: key is name, val is total.
field = getNUMAField(&stats, key)
if field == nil { // unknown field (new kernel?)
break
}
field.Total, err = strconv.ParseUint(val, 0, 64)
if err != nil {
return stats, err
}
field.Nodes = map[uint8]uint64{}
} else { // Subsequent columns: key is N<id>, val is usage.
if len(key) < 2 || key[0] != 'N' {
// This is definitely an error.
return stats, fmt.Errorf("malformed line %q in %s",
line, filename)
}
n, err := strconv.ParseUint(key[1:], 10, 8)
if err != nil {
return cgroups.PageUsageByNUMA{}, err
}
usage, err := strconv.ParseUint(val, 10, 64)
if err != nil {
return cgroups.PageUsageByNUMA{}, err
}
field.Nodes[uint8(n)] = usage
}
}
}
err = scanner.Err()
if err != nil {
return cgroups.PageUsageByNUMA{}, err
}
return stats, nil
}
func getNUMAField(stats *cgroups.PageUsageByNUMA, name string) *cgroups.PageStats {
switch name {
case "total":
return &stats.Total
case "file":
return &stats.File
case "anon":
return &stats.Anon
case "unevictable":
return &stats.Unevictable
case "hierarchical_total":
return &stats.Hierarchical.Total
case "hierarchical_file":
return &stats.Hierarchical.File
case "hierarchical_anon":
return &stats.Hierarchical.Anon
case "hierarchical_unevictable":
return &stats.Hierarchical.Unevictable
}
return nil
}

Some files were not shown because too many files have changed in this diff Show More