Skip to content

Commit

Permalink
Merge branch 'main' into usama.saqib/add-ubuntu-24.04
Browse files Browse the repository at this point in the history
  • Loading branch information
usamasaqib committed Nov 4, 2024
2 parents b7ae254 + 3e1c030 commit b23c3f5
Show file tree
Hide file tree
Showing 26 changed files with 838 additions and 34 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/buildimages-update.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ jobs:
ref: ${{ inputs.branch }}

- name: Setup Python and pip
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
# use Python < 3.12 so that distutil is still available by default
python-version: 3.11
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/chase_release_managers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
with:
ref: ${{ github.head_ref }}
- name: Install python
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: 3.11
cache: "pip"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/code_review_complexity.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
- name: Checkout repository
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4
- name: Setup python
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: 3.12
cache: 'pip'
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
fetch-depth: 0

- name: Setup Python3
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: "3.12.6"
cache: "pip"
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/create_rc_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
sparse-checkout: 'tasks'

- name: Install python
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: 3.11
cache: "pip"
Expand Down Expand Up @@ -64,7 +64,7 @@ jobs:
fetch-depth: 0

- name: Install python
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: 3.11
cache: "pip"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/create_release_schedule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
with:
ref: ${{ github.head_ref }}
- name: Install python
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: 3.11
cache: "pip"
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/cws-btfhub-sync.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ jobs:
sparse-checkout: ${{ matrix.cone }}

- name: Install python
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: 3.11
cache: 'pip'
Expand Down Expand Up @@ -106,7 +106,7 @@ jobs:
ref: ${{ inputs.base_branch || 'main' }}

- name: Install python
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: 3.11
cache: 'pip'
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/docs-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
fetch-depth: 0

- name: Set up Python
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: '3.12'

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/external-contributor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
ref: main
fetch-depth: 0
- name: Setup python
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: 3.11
cache: 'pip'
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/go_mod_tidy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
with:
go-version-file: ".go-version"
- name: Install python
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: 3.11
cache: "pip"
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/label-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
- name: Checkout repository
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4
- name: Setup python
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: 3.11
cache: 'pip'
Expand All @@ -45,7 +45,7 @@ jobs:
with:
fetch-depth: 0
- name: Setup python
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: 3.11
cache: 'pip'
Expand Down
31 changes: 31 additions & 0 deletions .github/workflows/report-merged-pr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Report PR merged event to Datadog
#
# Triggered whenever a pull request is closed; the single job below is skipped
# unless the pull request was actually merged.

name: Report Merged PR

on:
pull_request:
types: closed

# Empty mapping: no GITHUB_TOKEN permissions are requested for this workflow.
permissions: {}

jobs:
if_merged:
# The `closed` event also fires for PRs closed without merging; only report merges.
if: github.event.pull_request.merged == true
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4

- name: Setup Python3
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: "3.12.6"
cache: "pip"
cache-dependency-path: '**/requirements*.txt'

- name: Install python dependencies
run: pip3 install -r requirements.txt

# Runs the `github.pr-merge-dd-event-sender` invoke task, passing the merged PR number.
- name: Send merge event to Datadog
run: |
invoke -e github.pr-merge-dd-event-sender -p ${{ github.event.pull_request.number }}
1 change: 1 addition & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,7 @@ variables:
MACOS_GITHUB_APP_1: macos-github-app-one # agent-devx-infra
MACOS_GITHUB_APP_2: macos-github-app-two # agent-devx-infra
SLACK_AGENT: slack-agent-ci # agent-devx-infra
SMP_ACCOUNT: smp # single-machine-performance
# End vault variables

DD_PKG_VERSION: "latest"
Expand Down
10 changes: 5 additions & 5 deletions .gitlab/functional_test/regression_detector.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@ single-machine-performance-regression_detector:
- echo "Merge base is ${SMP_MERGE_BASE}"
# Setup AWS credentials for single-machine-performance AWS account
- AWS_NAMED_PROFILE="single-machine-performance"
- SMP_ACCOUNT_ID=$($CI_PROJECT_DIR/tools/ci/fetch_secret.sh $SMP_ACCOUNT_ID) || exit $?
- SMP_ACCOUNT_ID=$($CI_PROJECT_DIR/tools/ci/fetch_secret.sh $SMP_ACCOUNT account_id) || exit $?
- SMP_ECR_URL=${SMP_ACCOUNT_ID}.dkr.ecr.us-west-2.amazonaws.com
- SMP_AGENT_TEAM_ID=$($CI_PROJECT_DIR/tools/ci/fetch_secret.sh $SMP_AGENT_TEAM_ID) || exit $?
- SMP_API=$($CI_PROJECT_DIR/tools/ci/fetch_secret.sh $SMP_API) || exit $?
- SMP_BOT_ID=$($CI_PROJECT_DIR/tools/ci/fetch_secret.sh $SMP_BOT_ACCESS_KEY_ID) || exit $?
- SMP_BOT_KEY=$($CI_PROJECT_DIR/tools/ci/fetch_secret.sh $SMP_BOT_ACCESS_KEY) || exit $?
- SMP_AGENT_TEAM_ID=$($CI_PROJECT_DIR/tools/ci/fetch_secret.sh $SMP_ACCOUNT agent_team_id) || exit $?
- SMP_API=$($CI_PROJECT_DIR/tools/ci/fetch_secret.sh $SMP_ACCOUNT api_url) || exit $?
- SMP_BOT_ID=$($CI_PROJECT_DIR/tools/ci/fetch_secret.sh $SMP_ACCOUNT bot_login) || exit $?
- SMP_BOT_KEY=$($CI_PROJECT_DIR/tools/ci/fetch_secret.sh $SMP_ACCOUNT bot_token) || exit $?
- aws configure set aws_access_key_id "$SMP_BOT_ID" --profile ${AWS_NAMED_PROFILE}
- aws configure set aws_secret_access_key "$SMP_BOT_KEY" --profile ${AWS_NAMED_PROFILE}
- aws configure set region us-west-2 --profile ${AWS_NAMED_PROFILE}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ internal_kubernetes_deploy_experimental:
artifacts: false
- job: docker_trigger_cluster_agent_internal
artifacts: false
- job: deploy_packages_windows-x64-7
artifacts: false
- job: k8s-e2e-main # Currently only require container Argo workflow
artifacts: false
optional: true
Expand Down
1 change: 1 addition & 0 deletions comp/trace/agent/impl/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ func profilingConfig(tracecfg *tracecfg.AgentConfig) *profiling.Settings {
WithGoroutineProfile: pkgconfigsetup.Datadog().GetBool("internal_profiling.enable_goroutine_stacktraces"),
WithBlockProfile: pkgconfigsetup.Datadog().GetBool("internal_profiling.enable_block_profiling"),
WithMutexProfile: pkgconfigsetup.Datadog().GetBool("internal_profiling.enable_mutex_profiling"),
WithDeltaProfiles: pkgconfigsetup.Datadog().GetBool("internal_profiling.delta_profiles"),
Tags: tags,
}
}
Expand Down
126 changes: 126 additions & 0 deletions pkg/gpu/cuda/env.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2024-present Datadog, Inc.

//go:build linux_bpf

package cuda

import (
"fmt"
"strconv"
"strings"

"github.com/NVIDIA/go-nvml/pkg/nvml"

"github.com/DataDog/datadog-agent/pkg/util/kernel"
)

const cudaVisibleDevicesEnvVar = "CUDA_VISIBLE_DEVICES"

// GetVisibleDevicesForProcess narrows down systemDevices to the GPU devices
// selected by the CUDA_VISIBLE_DEVICES environment variable of the process
// with the given pid. The variable is looked up with
// kernel.GetProcessEnvVariable using the provided procfs path. Reference:
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars.
//
// In short, CUDA_VISIBLE_DEVICES is expected to be a comma separated list of
// GPU identifiers. Each identifier is either the index of the GPU (0, 1, 2)
// or the UUID of the GPU (GPU-<UUID>, or
// MIG-GPU-<UUID>/<instance-index>/<compute-index> for multi-instance GPUs).
// A UUID identifier does not have to be the complete UUID: a prefix that
// uniquely identifies the GPU is enough.
//
// Per the spec, "If one of the indices is invalid, only the devices whose
// index precedes the invalid index are visible to CUDA applications."
// Accordingly, when an invalid identifier is encountered, processing stops and
// an error is returned along with the devices resolved up to that point.
//
// If the environment variable cannot be read, a nil list and a wrapped error
// are returned.
func GetVisibleDevicesForProcess(systemDevices []nvml.Device, pid int, procfs string) ([]nvml.Device, error) {
	envValue, readErr := kernel.GetProcessEnvVariable(pid, procfs, cudaVisibleDevicesEnvVar)
	if readErr != nil {
		return nil, fmt.Errorf("cannot get env var %s for process %d: %w", cudaVisibleDevicesEnvVar, pid, readErr)
	}

	// The actual filtering only depends on the variable's value.
	return getVisibleDevices(systemDevices, envValue)
}

// getVisibleDevices filters systemDevices according to cudaVisibleDevices, the
// raw value of the CUDA_VISIBLE_DEVICES environment variable.
//
// An empty value returns systemDevices unchanged. Otherwise the value is split
// on commas and each entry is resolved in order:
//   - entries starting with "GPU-" are matched against device UUIDs by prefix;
//   - entries starting with "MIG-" (both the legacy "MIG-GPU-<UUID>/<gi>/<ci>"
//     form and the newer "MIG-<UUID>" form) are rejected, as MIG devices are
//     not supported yet;
//   - anything else is parsed as an index into systemDevices.
//
// Processing stops at the first entry that cannot be resolved. In that case
// the devices resolved so far are returned together with the error.
func getVisibleDevices(systemDevices []nvml.Device, cudaVisibleDevices string) ([]nvml.Device, error) {
	if cudaVisibleDevices == "" {
		return systemDevices, nil
	}

	var filteredDevices []nvml.Device

	for _, visibleDevice := range strings.Split(cudaVisibleDevices, ",") {
		var matchingDevice nvml.Device
		var err error

		switch {
		case strings.HasPrefix(visibleDevice, "GPU-"):
			matchingDevice, err = getDeviceWithMatchingUUIDPrefix(systemDevices, visibleDevice)
		case strings.HasPrefix(visibleDevice, "MIG-"):
			// MIG (Multi Instance GPUs) devices require extra parsing and data
			// about the MIG instance assignment, which is not supported yet.
			// Matching on "MIG-" rather than "MIG-GPU" also catches the newer
			// "MIG-<UUID>" identifiers, which would otherwise be reported as
			// an invalid device index.
			err = fmt.Errorf("MIG devices are not supported")
		default:
			matchingDevice, err = getDeviceWithIndex(systemDevices, visibleDevice)
		}

		// Entries after an invalid one are invisible, so stop at the first error.
		if err != nil {
			return filteredDevices, err
		}

		filteredDevices = append(filteredDevices, matchingDevice)
	}

	return filteredDevices, nil
}

// getDeviceWithMatchingUUIDPrefix looks for the single device in systemDevices
// whose UUID starts with uuidPrefix and returns it. An error is returned when
// the UUID of a device cannot be retrieved, when more than one device matches
// the prefix, or when no device matches at all.
func getDeviceWithMatchingUUIDPrefix(systemDevices []nvml.Device, uuidPrefix string) (nvml.Device, error) {
	var (
		found     nvml.Device
		foundUUID string
	)

	for _, candidate := range systemDevices {
		candidateUUID, ret := candidate.GetUUID()
		if ret != nvml.SUCCESS {
			return nil, fmt.Errorf("cannot get UUID for device: %s", nvml.ErrorString(ret))
		}

		if !strings.HasPrefix(candidateUUID, uuidPrefix) {
			continue
		}

		// A second match means the prefix does not identify a unique device.
		if found != nil {
			return nil, fmt.Errorf("non-unique UUID prefix %s, found UUIDs %s and %s", uuidPrefix, foundUUID, candidateUUID)
		}
		found, foundUUID = candidate, candidateUUID
	}

	if found == nil {
		return nil, fmt.Errorf("device with UUID prefix %s not found", uuidPrefix)
	}

	return found, nil
}

// getDeviceWithIndex returns the device located at the position given by
// visibleDevice, which must be the decimal representation of an index into
// systemDevices. An error is returned if visibleDevice is not a number, if
// systemDevices is empty, or if the index falls outside of
// [0, len(systemDevices)-1].
func getDeviceWithIndex(systemDevices []nvml.Device, visibleDevice string) (nvml.Device, error) {
	idx, err := strconv.Atoi(visibleDevice)
	if err != nil {
		return nil, fmt.Errorf("invalid device index %s: %w", visibleDevice, err)
	}

	numDevices := len(systemDevices)
	if numDevices == 0 {
		// Without this case the error below would report the nonsensical
		// range "[0, -1]".
		return nil, fmt.Errorf("device index %d is out of range: no devices available", idx)
	}

	if idx < 0 || idx >= numDevices {
		return nil, fmt.Errorf("device index %d is out of range [0, %d]", idx, numDevices-1)
	}

	return systemDevices[idx], nil
}
Loading

0 comments on commit b23c3f5

Please sign in to comment.