diff --git a/docker-compose.rocm.yml b/docker-compose.rocm.yml
new file mode 100644
index 00000000..c022ecab
--- /dev/null
+++ b/docker-compose.rocm.yml
@@ -0,0 +1,31 @@
+version: "3.9"
+services:
+  refact_self_hosted:
+    # TODO: figure out how to pass the GPU to docker builds, so there is no need to install deepspeed at runtime
+    command: >
+      /bin/bash -c 'pip install deepspeed --no-cache-dir
+      && python -m self_hosting_machinery.watchdog.docker_watchdog'
+    image: refact_self_hosting_rocm
+    build:
+      dockerfile: rocm.Dockerfile
+    shm_size: "32gb"
+    devices:
+      - "/dev/kfd"
+      - "/dev/dri"
+    group_add:
+      - "video"
+    security_opt:
+      - seccomp:unconfined
+    volumes:
+      - perm_storage:/perm_storage
+    ports:
+      - 8008:8008
+  nginx:
+    image: nginx
+    ports:
+      - "80:80"
+    volumes:
+      - ./nginx.conf:/etc/nginx/conf.d/default.conf:ro
+
+volumes:
+  perm_storage:
diff --git a/rocm.Dockerfile b/rocm.Dockerfile
new file mode 100644
index 00000000..cbe84453
--- /dev/null
+++ b/rocm.Dockerfile
@@ -0,0 +1,68 @@
+FROM ocelot88/rocm-pytorch-slim:rocm-5.7.1-dev-torch-2.3
+RUN apt-get update
+RUN DEBIAN_FRONTEND="noninteractive" apt-get install -y \
+    curl \
+    git \
+    htop \
+    tmux \
+    file \
+    vim \
+    expect \
+    mpich \
+    libmpich-dev \
+    python3 python3-pip \
+    && rm -rf /var/lib/{apt,dpkg,cache,log}
+
+
+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1
+
+# linguist requisites
+RUN apt-get update
+RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get install -y \
+    expect \
+    ruby-full \
+    ruby-bundler \
+    build-essential \
+    cmake \
+    pkg-config \
+    libicu-dev \
+    zlib1g-dev \
+    libcurl4-openssl-dev \
+    libssl-dev
+RUN git clone https://github.com/smallcloudai/linguist.git /tmp/linguist \
+    && cd /tmp/linguist \
+    && bundle install \
+    && rake build_gem
+
+ENV PATH="${PATH}:/tmp/linguist/bin"
+
+RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get install -y python3-packaging
+
+ENV INSTALL_OPTIONAL=TRUE
+ENV BUILD_CUDA_EXT=1
+ENV USE_ROCM=1
+ENV GITHUB_ACTIONS=true
+ENV AMDGPU_TARGETS="gfx1030"
+ENV FLASH_ATTENTION_FORCE_BUILD=TRUE
+ENV MAX_JOBS=8
+COPY . /tmp/app
+RUN pip install --upgrade pip ninja packaging
+RUN DEBIAN_FRONTEND=noninteractive apt-get install python3-mpi4py -y
+ENV PYTORCH_ROCM_ARCH="gfx1030"
+ENV ROCM_TARGET="gfx1030"
+ENV ROCM_HOME=/opt/rocm-5.7.1
+# TODO: https://github.com/TimDettmers/bitsandbytes/pull/756 remove this layer once this PR is merged
+RUN git clone https://github.com/arlo-phoenix/bitsandbytes-rocm-5.6 && \
+    cd bitsandbytes-rocm-5.6 && \
+    make hip && pip install . && \
+    cd .. && rm -rf bitsandbytes-rocm-5.6
+RUN pip install /tmp/app -v --no-build-isolation && rm -rf /tmp/app
+RUN ln -s ${ROCM_HOME} /opt/rocm
+ENV REFACT_PERM_DIR "/perm_storage"
+ENV REFACT_TMP_DIR "/tmp"
+ENV RDMAV_FORK_SAFE 0
+ENV RDMAV_HUGEPAGES_SAFE 0
+
+EXPOSE 8008
+
+CMD ["python", "-m", "self_hosting_machinery.watchdog.docker_watchdog"]
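
A quick sanity check that the resulting container actually sees the GPU can be run inside it. This is a sketch, not part of the change; it assumes the ROCm build of PyTorch shipped in the base image, which exposes HIP devices through the regular torch.cuda API:

    # gpu_smoke_test.py -- hypothetical helper, run inside the container
    import torch

    print(torch.version.hip)          # ROCm/HIP version the wheel was built against; None on CUDA builds
    print(torch.cuda.is_available())  # True only if /dev/kfd and /dev/dri were passed through
    if torch.cuda.is_available():
        print(torch.cuda.get_device_name(0))  # should report the gfx1030-class card targeted above
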
diff --git a/self_hosting_machinery/scripts/enum_gpus.py b/self_hosting_machinery/scripts/enum_gpus.py
index faa3504a..620b9acc 100644
--- a/self_hosting_machinery/scripts/enum_gpus.py
+++ b/self_hosting_machinery/scripts/enum_gpus.py
@@ -9,6 +9,38 @@
 from self_hosting_machinery import env
 
 
+def query_rocm_smi():
+    rocm_smi_output = "- no output -"
+    descriptions = []
+    try:
+        rocm_smi_output = subprocess.check_output([
+            "/opt/rocm/bin/rocm-smi",
+            "--showbus",
+            "--showproductname",
+            "--showtemp",
+            "--showmeminfo", "vram",
+            "--json"])
+        logging.info(rocm_smi_output)
+        smi_output_dict = json.loads(rocm_smi_output)
+        for gpu_id, props in smi_output_dict.items():
+            descriptions.append({
+                "id": props.get("PCI Bus"),
+                "name": props.get("Card model", "AMD GPU"),
+                "mem_used_mb": bytes_to_mb(int(props.get("VRAM Total Used Memory (B)", 0))),
+                "mem_total_mb": bytes_to_mb(int(props.get("VRAM Total Memory (B)", 0))),
+                "temp_celsius": props.get("Temperature (Sensor junction) (C)", -1),
+            })
+    except Exception:
+        logging.warning("rocm-smi does not work, that's especially bad for initial setup.")
+        logging.warning(traceback.format_exc())
+        logging.warning(f"output was:\n{rocm_smi_output}")
+
+    return {"gpus": descriptions}
+
+def bytes_to_mb(bytes_size):
+    mb_size = bytes_size / (1024 ** 2)
+    return mb_size
+
 
 def query_nvidia_smi():
     nvidia_smi_output = "- no output -"
@@ -42,7 +74,10 @@
 
 
 def enum_gpus():
-    result = query_nvidia_smi()
+    if os.environ.get('USE_ROCM'):
+        result = query_rocm_smi()
+    else:
+        result = query_nvidia_smi()
     with open(env.CONFIG_ENUM_GPUS + ".tmp", 'w') as f:
         json.dump(result, f, indent=4)
     os.rename(env.CONFIG_ENUM_GPUS + ".tmp", env.CONFIG_ENUM_GPUS)
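
For reference, query_rocm_smi() assumes rocm-smi --json returns one object per card, keyed by card id, carrying exactly the fields read above. A hypothetical payload (card name and numbers invented for illustration) and the conversion it goes through:

    import json

    # Hypothetical rocm-smi --json payload, using only the keys the parser reads.
    sample = json.loads('''
    {
        "card0": {
            "PCI Bus": "0000:03:00.0",
            "Card model": "Navi 21",
            "Temperature (Sensor junction) (C)": "47.0",
            "VRAM Total Memory (B)": "17163091968",
            "VRAM Total Used Memory (B)": "289406976"
        }
    }
    ''')

    props = sample["card0"]
    print(int(props["VRAM Total Memory (B)"]) / (1024 ** 2))  # 16368.0 MB, what bytes_to_mb() computes
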
diff --git a/setup.py b/setup.py
index db73839a..6f89ceb6 100644
--- a/setup.py
+++ b/setup.py
@@ -9,6 +9,7 @@
 
 setup_package = os.environ.get("SETUP_PACKAGE", None)
 install_optional = os.environ.get("INSTALL_OPTIONAL", "FALSE")
+use_rocm = os.environ.get("USE_ROCM", "FALSE")
 
 
 @dataclass
@@ -44,12 +45,24 @@ class PyPackage:
     "self_hosting_machinery": PyPackage(
         requires=["aiohttp", "aiofiles", "cryptography", "fastapi==0.100.0", "giturlparse", "pydantic==1.10.13",
                   "starlette==0.27.0", "uvicorn", "uvloop", "python-multipart", "auto-gptq==0.4.2", "accelerate",
-                  "termcolor", "torch", "transformers==4.34.0", "bitsandbytes", "safetensors", "peft", "triton",
-                  "torchinfo", "mpi4py", "deepspeed==0.11.1"],
-        optional=["ninja", "flash_attn @ git+https://github.com/smallcloudai/flash-attention@feat/alibi"],
+                  "termcolor", "torch", "transformers==4.34.0", "bitsandbytes", "safetensors", "peft",
+                  "torchinfo"],
+        optional=["ninja"],
         requires_packages=["refact_scratchpads", "refact_scratchpads_no_gpu", "known_models_db",
                            "refact_data_pipeline"],
         data=["webgui/static/*", "webgui/static/js/*",
               "webgui/static/components/modals/*",
               "watchdog/watchdog.d/*"]),
+    "rocm": PyPackage(
+        requires=[
+            # "bitsandbytes",  # TODO: bitsandbytes does not support ROCm yet, so we build it from source, see: https://github.com/TimDettmers/bitsandbytes/pull/756
+            # "deepspeed",  # TODO: figure out how to install deepspeed at build time, see: docker-compose.rocm.yml
+            # "flash_attn",  # TODO: flash_attn only supports a limited set of GPUs on ROCm, see: https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm2
+            "pytorch-triton-rocm",
+        ]
+    ),
+    "cuda": PyPackage(
+        requires=["mpi4py", "deepspeed==0.11.1", "triton"],
+        optional=["flash_attn @ git+https://github.com/smallcloudai/flash-attention@feat/alibi"],
+    ),
 }
@@ -66,17 +79,30 @@ def find_required_packages(packages: Set[str]) -> Set[str]:
 def get_install_requires(packages):
     install_requires = list({
         required_package
-        for py_package in packages.values()
+        for key, py_package in packages.items()
         for required_package in py_package.requires
+        if key not in ("rocm", "cuda")
     })
     if install_optional.upper() == "TRUE":
         install_requires.extend(list({
             required_package
-            for py_package in packages.values()
+            for key, py_package in packages.items()
             for required_package in py_package.optional
+            if key not in ("rocm", "cuda")
         }))
+    install_requires.extend(get_runtime_dependent_dependencies(packages))
     return install_requires
 
 
+def get_runtime_dependent_dependencies(packages):
+    required = []
+    use_rocm_flag = use_rocm.upper() not in ("", "FALSE", "0")
+    runtime_key = "rocm" if use_rocm_flag else "cuda"
+    required.extend(packages.get(runtime_key).requires)
+    if install_optional.upper() == "TRUE":
+        required.extend(packages.get(runtime_key).optional)
+    return required
+
+
 if setup_package is not None:
     if setup_package not in all_refact_packages:
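
To illustrate what the requires split above does, the runtime-dependent selection can be exercised standalone. A minimal sketch, assuming PyPackage defaults optional to an empty list (as the "rocm" entry implies):

    from dataclasses import dataclass, field
    from typing import Dict, List

    @dataclass
    class PyPackage:
        requires: List[str] = field(default_factory=list)
        optional: List[str] = field(default_factory=list)

    packages: Dict[str, PyPackage] = {
        "rocm": PyPackage(requires=["pytorch-triton-rocm"]),
        "cuda": PyPackage(requires=["mpi4py", "deepspeed==0.11.1", "triton"],
                          optional=["flash_attn @ git+https://github.com/smallcloudai/flash-attention@feat/alibi"]),
    }

    def runtime_deps(use_rocm: bool, install_optional: bool) -> List[str]:
        # Mirrors get_runtime_dependent_dependencies(): pick exactly one backend's package set.
        key = "rocm" if use_rocm else "cuda"
        deps = list(packages[key].requires)
        if install_optional:
            deps += packages[key].optional
        return deps

    print(runtime_deps(use_rocm=True, install_optional=True))    # ['pytorch-triton-rocm']
    print(runtime_deps(use_rocm=False, install_optional=False))  # ['mpi4py', 'deepspeed==0.11.1', 'triton']

So a build with USE_ROCM set (as rocm.Dockerfile does) pulls pytorch-triton-rocm instead of triton, while deepspeed is installed at container start and the patched bitsandbytes is built from source in the image.
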