Skip to content

Commit

Permalink
Re-org python module (#177)
Browse files Browse the repository at this point in the history
* re-org python module

* update import path

* run _load_clib('libkungfu') manually

* cleanup scripts

* remove RPATH

* fix import
  • Loading branch information
lgarithm authored and luomai committed Oct 26, 2019
1 parent b24c52e commit ba9ab1e
Show file tree
Hide file tree
Showing 25 changed files with 74 additions and 177 deletions.
3 changes: 0 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ ENDIF()

IF(KUNGFU_BUILD_TF_OPS)
INCLUDE(cmake/tf-op.cmake)
INCLUDE(cmake/rpath.cmake)

ADD_LIBRARY(kungfu_python SHARED srcs/cpp/src/python/init.cpp)
TARGET_LINK_LIBRARIES(kungfu_python kungfu)
Expand Down Expand Up @@ -61,8 +60,6 @@ IF(KUNGFU_BUILD_TF_OPS)
ENDIF()

SET_TF_CXX11_ABI(kungfu_python)
SET_INSTALL_RPATH(kungfu_python)
SET_INSTALL_RPATH(kungfu_tensorflow_ops)
ENDIF()

IF(KUNGFU_BUILD_TOOLS)
Expand Down
8 changes: 0 additions & 8 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,3 @@ KungFu can use [NCCL](https://developer.nvidia.com/nccl) to leverage GPU-GPU dir

KUNGFU_ENABLE_NCCL=1 pip3 install .
```

## Known issues

For Mac users, the following is required if you run the program without ``kungfu-run``:

```bash
export DYLD_LIBRARY_PATH=$(python3 -c "import os; import kungfu; print(os.path.dirname(kungfu.__file__))")
```
4 changes: 0 additions & 4 deletions benchmarks/adaptation/bench-adaptation.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,6 @@ kungfu_run() {
$@
}

if [ $(uname -s) = "Darwin" ]; then
export DYLD_LIBRARY_PATH=$(python3 -c "import os; import kungfu; print(os.path.dirname(kungfu.__file__))")
fi

TF_CPP_MIN_LOG_LEVEL=2

kungfu_run 2 python3 adaptive_trainer.py
Expand Down
18 changes: 0 additions & 18 deletions cmake/rpath.cmake

This file was deleted.

4 changes: 2 additions & 2 deletions examples/mnist_keras.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@

import kungfu as kf
import tensorflow as tf
from kungfu.tensorflow.v1.ops import (broadcast, current_cluster_size,
current_rank)
from kungfu import current_cluster_size, current_rank
from kungfu.tensorflow.v1.ops import broadcast


class InitalizationCallback(tf.keras.callbacks.Callback):
Expand Down
2 changes: 1 addition & 1 deletion examples/mnist_slp.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
import kungfu as kf
import numpy as np
import tensorflow as tf
from kungfu import current_cluster_size, current_rank
from kungfu.tensorflow.v1.helpers.mnist import load_datasets
from kungfu.tensorflow.v1.ops import current_cluster_size, current_rank


# TODO add an explanation of what the function does
Expand Down
4 changes: 0 additions & 4 deletions experimental/atari_pingpong/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,6 @@ set -e
cd $(dirname $0)
ROOT=$(cd ../../.. && pwd)

if [ $(uname -s) = "Darwin" ]; then
export DYLD_LIBRARY_PATH=$(python3 -c "import os; import kungfu; print(os.path.dirname(kungfu.__file__))")
fi

single_train() {
export PYTHONUNBUFFERED=1
python3 ./pingpong.py
Expand Down
51 changes: 0 additions & 51 deletions scripts/parallel-train.sh

This file was deleted.

4 changes: 0 additions & 4 deletions scripts/tests/run-fake-trainer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,6 @@ run_fake_tf_trainer() {
local np=$1
local H=127.0.0.1:$np

if [ $(uname -s) = "Darwin" ]; then
export DYLD_LIBRARY_PATH=$(python3 -c "import os; import kungfu; print(os.path.dirname(kungfu.__file__))")
fi

env \
KUNGFU_TEST_CLUSTER_SIZE=$np \
${KUNGFU_RUN} \
Expand Down
4 changes: 0 additions & 4 deletions scripts/tests/run-train-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,6 @@ ensure_kungfu_run() {
ensure_kungfu_run
export KUNGFU_CONFIG_LOG_CONFIG_VARS=true

if [ $(uname -s) = "Darwin" ]; then
export DYLD_LIBRARY_PATH=$(${PYTHON} -c "import os; import kungfu; print(os.path.dirname(kungfu.__file__))")
fi

SCRIPT=${ROOT}/tests/python/test_mnist_slp.py

epochs=2
Expand Down
18 changes: 0 additions & 18 deletions srcs/go/scheduler/env.go

This file was deleted.

1 change: 0 additions & 1 deletion srcs/go/scheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ func (jc JobConfig) NewProc(peer plan.PeerID, localRank int, checkpoint string,
kb.AllReduceStrategyEnvKey: jc.Strategy.String(),
}
allEnvs := merge(getConfigEnvs(), envs)
allEnvs.addIfMissing(`DYLD_LIBRARY_PATH`, DefaultLdLibraryPath)
allEnvs.addIfMissing(`PYTHONUNBUFFERED`, `1`)
var pubAddr string
for _, h := range jc.HostList {
Expand Down
1 change: 1 addition & 0 deletions srcs/python/kungfu/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .ext import current_cluster_size, current_rank, run_barrier
30 changes: 30 additions & 0 deletions srcs/python/kungfu/ext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from .loader import _call_method, _load_clib, _module_path


def _load_and_init_python_lib():
    """Load the KungFu shared libraries and run the python init hooks.

    Returns:
        A ``(lib, has_gpu)`` pair: ``lib`` is the handle returned by
        ``_load_clib('libkungfu_python')`` and ``has_gpu`` is the value
        returned by ``_call_method`` for ``'kungfu_python_init_gpu'``.
    """
    # libkungfu is loaded first and its handle is discarded; presumably
    # libkungfu_python depends on its symbols -- TODO confirm.
    _load_clib('libkungfu')
    lib = _load_clib('libkungfu_python')
    _call_method(lib, 'kungfu_python_init')
    gpu_initialized = _call_method(lib, 'kungfu_python_init_gpu')
    return lib, gpu_initialized


# Load and initialize the native libraries once, at module import time.
_python_lib, _has_gpu = _load_and_init_python_lib()


def current_rank():
    """Return the result of ``kungfu_rank()`` from the loaded python library."""
    rank = _python_lib.kungfu_rank()
    return rank


def current_cluster_size():
    """Return the result of ``kungfu_cluster_size()`` from the loaded python library."""
    cluster_size = _python_lib.kungfu_cluster_size()
    return cluster_size


def run_barrier():
    """Call ``kungfu_barrier()`` in the loaded python library.

    The native return value is discarded; this function returns ``None``.
    """
    barrier = _python_lib.kungfu_barrier
    barrier()


def _get_other_ranks():
    """Return every rank in ``range(current_cluster_size())`` except our own.

    The result is a list in ascending order.
    """
    # Query the rank before the cluster size, matching the original call order.
    me = current_rank()
    cluster_size = current_cluster_size()
    return [rank for rank in range(cluster_size) if rank != me]
7 changes: 7 additions & 0 deletions srcs/python/kungfu/config.py → srcs/python/kungfu/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,10 @@ def _load_clib(name):
suffix = 'so' if platform.uname()[0] != 'Darwin' else 'dylib'
filename = os.path.join(_module_path(), name + '.' + suffix)
return cdll.LoadLibrary(filename)


def _call_method(lib, name):
    """Call the zero-argument attribute ``name`` of ``lib`` if it exists.

    Args:
        lib: Object to look the attribute up on (e.g. a loaded library).
        name: Attribute name to look up and call.

    Returns:
        True if the attribute exists and was called, False if ``lib`` has
        no such attribute. The value returned by the call is discarded.
    """
    try:
        method = getattr(lib, name)
    except AttributeError:
        # Missing symbol: report it instead of failing.
        return False
    method()
    return True
8 changes: 5 additions & 3 deletions srcs/python/kungfu/tensorflow/v1/ops/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from kungfu.ext import (_get_other_ranks, _has_gpu, current_cluster_size,
current_rank, run_barrier)

from ._tf_oplib import _op_lib
from .adapt import get_init_checkpoint, resize_cluster
from .collective import (all_reduce, barrier, broadcast, group_all_reduce,
group_nccl_all_reduce)
from .loader import _has_gpu, _op_lib, _python_lib, run_barrier
from .local import save_variable, save_variables
from .monitor import global_noise_scale
from .p2p import request_variable, request_variable_with_template
from .topology import (_get_other_ranks, current_cluster_size, current_rank,
peer_info)
from .topology import peer_info


def _tensor_size(t):
Expand Down
21 changes: 21 additions & 0 deletions srcs/python/kungfu/tensorflow/v1/ops/_tf_oplib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import os
import sysconfig

from kungfu.loader import _module_path

EXT_SUFFIX_KEY = 'SO' # 'EXT_SUFFIX' doesn't work for python2


def _load_op_lib(name):
    """Load the TensorFlow op library ``name`` from the kungfu module directory.

    The file name is ``name`` followed by the interpreter's extension-module
    suffix, joined onto ``_module_path()``.

    Args:
        name: Base name of the op library, without any suffix.

    Returns:
        Whatever ``tf.load_op_library`` returns for the resolved file.

    Raises:
        RuntimeError: If the interpreter reports no extension suffix.
    """
    # Prefer the current 'EXT_SUFFIX' key and fall back to the legacy key
    # (EXT_SUFFIX_KEY) for interpreters that only define that one. The legacy
    # key is deprecated and may be None on newer interpreters, which would
    # otherwise surface as an opaque TypeError in the concatenation below.
    suffix = (sysconfig.get_config_var('EXT_SUFFIX')
              or sysconfig.get_config_var(EXT_SUFFIX_KEY))
    if not suffix:
        raise RuntimeError(
            'cannot determine the extension suffix for op library %r' % name)
    filename = os.path.join(_module_path(), name + suffix)
    import tensorflow as tf
    return tf.load_op_library(filename)


def _load_and_init_op_lib():
    """Load the ``kungfu_tensorflow_ops`` op library and return its handle."""
    return _load_op_lib('kungfu_tensorflow_ops')


# Load the TensorFlow op library once, at module import time.
_op_lib = _load_and_init_op_lib()
2 changes: 1 addition & 1 deletion srcs/python/kungfu/tensorflow/v1/ops/adapt.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os

from .loader import _op_lib, _python_lib
from ._tf_oplib import _op_lib


def get_init_checkpoint():
Expand Down
2 changes: 1 addition & 1 deletion srcs/python/kungfu/tensorflow/v1/ops/collective.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .loader import _op_lib
from ._tf_oplib import _op_lib
from .topology import peer_info


Expand Down
35 changes: 0 additions & 35 deletions srcs/python/kungfu/tensorflow/v1/ops/loader.py

This file was deleted.

2 changes: 1 addition & 1 deletion srcs/python/kungfu/tensorflow/v1/ops/local.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .loader import _op_lib
from ._tf_oplib import _op_lib


def save_variable(t, version=None):
Expand Down
2 changes: 1 addition & 1 deletion srcs/python/kungfu/tensorflow/v1/ops/monitor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .loader import _op_lib
from ._tf_oplib import _op_lib


def global_noise_scale(batch_small, batch_big, tensor, avg_tensor, alpha=0.6):
Expand Down
2 changes: 1 addition & 1 deletion srcs/python/kungfu/tensorflow/v1/ops/p2p.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .loader import _op_lib
from ._tf_oplib import _op_lib


def request_variable(target, version=None, name=None, shape=None, dtype=None):
Expand Down
16 changes: 1 addition & 15 deletions srcs/python/kungfu/tensorflow/v1/ops/topology.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,4 @@
from .loader import _op_lib, _python_lib


def current_rank():
return _python_lib.kungfu_rank()


def current_cluster_size():
return _python_lib.kungfu_cluster_size()


def _get_other_ranks():
self_rank = current_rank()
ranks = list(range(current_cluster_size()))
return [r for r in ranks if r != self_rank]
from ._tf_oplib import _op_lib


def peer_info():
Expand Down
2 changes: 1 addition & 1 deletion tests/python/test_python_apis.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# FIXME: make sure it runs without tensorflow
from kungfu.tensorflow.v1.ops import run_barrier, current_cluster_size, current_rank
from kungfu import run_barrier, current_cluster_size, current_rank


def test_barrier():
Expand Down

0 comments on commit ba9ab1e

Please sign in to comment.