diff --git a/examples/notebooks/ogbn_mag_e2e.ipynb b/examples/notebooks/ogbn_mag_e2e.ipynb index 9e8c5335..ea124352 100644 --- a/examples/notebooks/ogbn_mag_e2e.ipynb +++ b/examples/notebooks/ogbn_mag_e2e.ipynb @@ -101,15 +101,25 @@ "colab": { "base_uri": "https://localhost:8080/" }, + "executionInfo": { + "elapsed": 10039, + "status": "ok", + "timestamp": 1711472628551, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": -60 + }, "id": "oA4_zh0EyNHv", - "outputId": "4e3e16b7-64dd-4516-99da-8cea252750d8" + "outputId": "8b415cad-86b7-4169-cc9f-2dd9f6b02f2c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Running TF-GNN 1.0.2 under TensorFlow 2.12.0.\n" + "Running TF-GNN 1.0.2 under TensorFlow 2.15.0.\n" ] } ], @@ -435,10 +445,9 @@ "source": [ "## Distributed Training\n", "\n", - "\n", - "\n", "We use TensorFlow's [Distribution Strategy](https://www.tensorflow.org/guide/distributed_training) API to write a model that can run on multiple TPUs, multiple GPUs, or maybe just locally on CPU.\n", - "\n" + "\n", + "For CloudTPU, the following code assumes the Colab runtime type \"TPU v2\", that is, a TPU VM. Do not use the runtime type \"TPU (deprecated)\", which uses a TPU Node on a separate VM." ] }, { @@ -448,31 +457,34 @@ "colab": { "base_uri": "https://localhost:8080/" }, + "executionInfo": { + "elapsed": 26820, + "status": "ok", + "timestamp": 1711472717800, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": -60 + }, "id": "2oBuJEZ3izQm", - "outputId": "680d981b-9ee6-4ffe-d696-c70110edadca" + "outputId": "db98cb52-837a-4552-cf63-606cb88ffa25" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Running on TPU ['10.116.185.2:8470']\n", "Using TPUStrategy\n", "Found 8 replicas in sync\n" ] } ], "source": [ - "try:\n", - " tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver()\n", - " print(\"Running on TPU \", tpu_resolver.cluster_spec().as_dict()[\"worker\"])\n", - "except:\n", - " tpu_resolver = None\n", - "\n", - "if tpu_resolver:\n", - " print(\"Using TPUStrategy\")\n", + "if tf.config.list_physical_devices(\"TPU\"):\n", + " print(f\"Using TPUStrategy\")\n", " min_nodes_per_component = {\"paper\": 1}\n", - " strategy = runner.TPUStrategy()\n", + " strategy = runner.TPUStrategy(\"local\")\n", " train_padding = runner.FitOrSkipPadding(example_input_graph_spec, train_ds_provider, min_nodes_per_component)\n", " valid_padding = runner.TightPadding(example_input_graph_spec, valid_ds_provider, min_nodes_per_component)\n", "elif tf.config.list_physical_devices(\"GPU\"):\n", @@ -846,8 +858,8 @@ "global_batch_size = 128\n", "epochs = 10\n", "initial_learning_rate = 0.001\n", - "if tpu_resolver:\n", - " # Training on TPU takes ~90 secs / epoch, so we train for the entire epoch.\n", + "if tf.config.list_physical_devices(\"TPU\"):\n", + " # Training on TPU takes ~130 secs / epoch, so we train for the entire epoch.\n", " epoch_divisor = 1\n", "else:\n", " # Training on GPU / CPU is slower, so we train for 1/100th of a true epoch.\n", @@ -882,9 +894,7 @@ "source": [ "## Export options for inference\n", "\n", - "For inference, a SavedModel must be exported by the runner at the end of training. 
C++ inference environments like TF Serving do not support input of extension types like GraphTensor, so the `KerasModelExporter` exports the model with a SavedModel Signature that accepts a batch of serialized tf.Examples and preprocesses them like training did.\n", - "\n", - "Note: After connecting this Colab to a TPU worker, explicit device placements are necessary to do the test on the colab host (which has the `/tmp/gnn_model` directory)." + "For inference, a SavedModel must be exported by the runner at the end of training. C++ inference environments like TF Serving do not support input of extension types like GraphTensor, so the `KerasModelExporter` exports the model with a SavedModel Signature that accepts a batch of serialized tf.Examples and preprocesses them like training did." ] }, { @@ -895,9 +905,7 @@ }, "outputs": [], "source": [ - "save_options = tf.saved_model.SaveOptions(experimental_io_device=\"/job:localhost\")\n", - "model_exporter = runner.KerasModelExporter(output_names=\"paper_venue_logits\",\n", - " options=save_options)" + "model_exporter = runner.KerasModelExporter(output_names=\"paper_venue_logits\")" ] }, { @@ -924,8 +932,18 @@ "colab": { "base_uri": "https://localhost:8080/" }, + "executionInfo": { + "elapsed": 427499, + "status": "ok", + "timestamp": 1711474246342, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": -60 + }, "id": "Ay2hhL3d0dZz", - "outputId": "09663b5e-8a98-4753-f900-c24e56f054c1" + "outputId": "70fa9f6c-a2c5-4bfa-a0ef-c8ad295c50cb" }, "outputs": [ { @@ -933,38 +951,31 @@ "output_type": "stream", "text": [ "Epoch 1/10\n", - "4918/4918 [==============================] - 142s 29ms/step - loss: 2.6329 - sparse_categorical_accuracy: 0.3213 - sparse_categorical_crossentropy: 2.7456 - val_loss: 2.1140 - val_sparse_categorical_accuracy: 0.4131 - val_sparse_categorical_crossentropy: 2.1837\n", + "4918/4918 [==============================] - 170s 34ms/step - loss: 2.6135 - sparse_categorical_accuracy: 0.3245 - sparse_categorical_crossentropy: 2.7248 - val_loss: 2.0817 - val_sparse_categorical_accuracy: 0.4239 - val_sparse_categorical_crossentropy: 2.1490\n", "Epoch 2/10\n", - "4918/4918 [==============================] - 91s 18ms/step - loss: 2.1113 - sparse_categorical_accuracy: 0.4201 - sparse_categorical_crossentropy: 2.1739 - val_loss: 1.9439 - val_sparse_categorical_accuracy: 0.4548 - val_sparse_categorical_crossentropy: 1.9911\n", + "4918/4918 [==============================] - 129s 26ms/step - loss: 2.1057 - sparse_categorical_accuracy: 0.4225 - sparse_categorical_crossentropy: 2.1676 - val_loss: 2.0053 - val_sparse_categorical_accuracy: 0.4330 - val_sparse_categorical_crossentropy: 2.0561\n", "Epoch 3/10\n", - "4918/4918 [==============================] - 90s 18ms/step - loss: 1.9727 - sparse_categorical_accuracy: 0.4516 - sparse_categorical_crossentropy: 2.0184 - val_loss: 1.8663 - val_sparse_categorical_accuracy: 0.4672 - val_sparse_categorical_crossentropy: 1.9032\n", + "4918/4918 [==============================] - 128s 26ms/step - loss: 1.9673 - sparse_categorical_accuracy: 0.4541 - sparse_categorical_crossentropy: 2.0124 - val_loss: 1.8902 - val_sparse_categorical_accuracy: 0.4703 - val_sparse_categorical_crossentropy: 1.9283\n", "Epoch 4/10\n", - "4918/4918 [==============================] - 89s 18ms/step - loss: 1.8827 - sparse_categorical_accuracy: 0.4718 - sparse_categorical_crossentropy: 1.9195 - val_loss: 1.8593 - val_sparse_categorical_accuracy: 0.4698 - val_sparse_categorical_crossentropy: 1.8943\n", + "4918/4918 
[==============================] - 130s 26ms/step - loss: 1.8787 - sparse_categorical_accuracy: 0.4740 - sparse_categorical_crossentropy: 1.9149 - val_loss: 1.8447 - val_sparse_categorical_accuracy: 0.4803 - val_sparse_categorical_crossentropy: 1.8784\n", "Epoch 5/10\n", - "4918/4918 [==============================] - 90s 18ms/step - loss: 1.8079 - sparse_categorical_accuracy: 0.4894 - sparse_categorical_crossentropy: 1.8400 - val_loss: 1.7997 - val_sparse_categorical_accuracy: 0.4880 - val_sparse_categorical_crossentropy: 1.8320\n", + "4918/4918 [==============================] - 129s 26ms/step - loss: 1.8062 - sparse_categorical_accuracy: 0.4904 - sparse_categorical_crossentropy: 1.8378 - val_loss: 1.8227 - val_sparse_categorical_accuracy: 0.4787 - val_sparse_categorical_crossentropy: 1.8559\n", "Epoch 6/10\n", - "4918/4918 [==============================] - 90s 18ms/step - loss: 1.7434 - sparse_categorical_accuracy: 0.5032 - sparse_categorical_crossentropy: 1.7732 - val_loss: 1.7836 - val_sparse_categorical_accuracy: 0.4879 - val_sparse_categorical_crossentropy: 1.8171\n", + "4918/4918 [==============================] - 130s 26ms/step - loss: 1.7416 - sparse_categorical_accuracy: 0.5043 - sparse_categorical_crossentropy: 1.7708 - val_loss: 1.7801 - val_sparse_categorical_accuracy: 0.4919 - val_sparse_categorical_crossentropy: 1.8128\n", "Epoch 7/10\n", - "4918/4918 [==============================] - 89s 18ms/step - loss: 1.6894 - sparse_categorical_accuracy: 0.5161 - sparse_categorical_crossentropy: 1.7182 - val_loss: 1.7512 - val_sparse_categorical_accuracy: 0.4984 - val_sparse_categorical_crossentropy: 1.7851\n", + "4918/4918 [==============================] - 133s 27ms/step - loss: 1.6856 - sparse_categorical_accuracy: 0.5167 - sparse_categorical_crossentropy: 1.7136 - val_loss: 1.7456 - val_sparse_categorical_accuracy: 0.4999 - val_sparse_categorical_crossentropy: 1.7787\n", "Epoch 8/10\n", - "4918/4918 [==============================] - 91s 18ms/step - loss: 1.6422 - sparse_categorical_accuracy: 0.5261 - sparse_categorical_crossentropy: 1.6702 - val_loss: 1.7340 - val_sparse_categorical_accuracy: 0.5009 - val_sparse_categorical_crossentropy: 1.7686\n", + "4918/4918 [==============================] - 130s 26ms/step - loss: 1.6424 - sparse_categorical_accuracy: 0.5263 - sparse_categorical_crossentropy: 1.6700 - val_loss: 1.7497 - val_sparse_categorical_accuracy: 0.4955 - val_sparse_categorical_crossentropy: 1.7849\n", "Epoch 9/10\n", - "4918/4918 [==============================] - 90s 18ms/step - loss: 1.6122 - sparse_categorical_accuracy: 0.5329 - sparse_categorical_crossentropy: 1.6396 - val_loss: 1.7371 - val_sparse_categorical_accuracy: 0.5003 - val_sparse_categorical_crossentropy: 1.7728\n", + "4918/4918 [==============================] - 131s 27ms/step - loss: 1.6112 - sparse_categorical_accuracy: 0.5332 - sparse_categorical_crossentropy: 1.6382 - val_loss: 1.7343 - val_sparse_categorical_accuracy: 0.5013 - val_sparse_categorical_crossentropy: 1.7693\n", "Epoch 10/10\n", - "4918/4918 [==============================] - 89s 18ms/step - loss: 1.5958 - sparse_categorical_accuracy: 0.5365 - sparse_categorical_crossentropy: 1.6227 - val_loss: 1.7306 - val_sparse_categorical_accuracy: 0.5013 - val_sparse_categorical_crossentropy: 1.7659\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:absl:Found untraced functions such as _update_step_xla while saving (showing 1 of 1). 
These functions will not be directly callable after loading.\n" + "4918/4918 [==============================] - 132s 27ms/step - loss: 1.5959 - sparse_categorical_accuracy: 0.5372 - sparse_categorical_crossentropy: 1.6224 - val_loss: 1.7417 - val_sparse_categorical_accuracy: 0.4991 - val_sparse_categorical_crossentropy: 1.7773\n" ] }, { "data": { "text/plain": [ - "RunResult(preprocess_model=\u003ckeras.engine.functional.Functional object at 0x7c1309331660\u003e, base_model=\u003ckeras.engine.sequential.Sequential object at 0x7c124546f670\u003e, trained_model=\u003ckeras.engine.functional.Functional object at 0x7c12454f41f0\u003e)" + "RunResult(preprocess_model=\u003ckeras.src.engine.functional.Functional object at 0x7fed485eee90\u003e, base_model=\u003ckeras.src.engine.sequential.Sequential object at 0x7febfa0b3280\u003e, trained_model=\u003ckeras.src.engine.functional.Functional object at 0x7fec8811a9e0\u003e)" ] }, "execution_count": 17, @@ -997,7 +1008,7 @@ }, "source": [ "## Inference using Exported Model\n", - "At the end of training, a SavedModel is exported by the Runner for inference. For demonstration, let's call the exported model on the validation dataset from above, but without labels. We load it as a SavedModel, like TF Serving would. Analogous to the SaveOptions above, LoadOptions with a device placement are necessary when connecting this Colab to a TPU worker.\n", + "At the end of training, a SavedModel is exported by the Runner for inference. For demonstration, let's call the exported model on the validation dataset from above, but without labels. We load it as a SavedModel, like TF Serving would.\n", "\n", "NOTE: TF Serving usually expects examples in form of serialized strings, therefore we explicitly convert the graph tensors to serialized string format and pass it to the loaded model.\n", "\n", @@ -1012,32 +1023,40 @@ "colab": { "base_uri": "https://localhost:8080/" }, + "executionInfo": { + "elapsed": 51166, + "status": "ok", + "timestamp": 1711474297507, + "user": { + "displayName": "", + "userId": "" + }, + "user_tz": -60 + }, "id": "ki33s9EpsQnF", - "outputId": "b87d9ded-70f8-4abb-f9d1-fb5a547d59ff" + "outputId": "8e6a7ba6-514e-4dda-f96f-deea16d185b1" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The predicted class for input 0 is 9 with predicted probability 0.4965\n", - "The predicted class for input 1 is 189 with predicted probability 0.2402\n", - "The predicted class for input 2 is 189 with predicted probability 0.4836\n", - "The predicted class for input 3 is 158 with predicted probability 0.958\n", - "The predicted class for input 4 is 341 with predicted probability 0.2332\n", - "The predicted class for input 5 is 189 with predicted probability 0.5669\n", - "The predicted class for input 6 is 209 with predicted probability 0.3472\n", - "The predicted class for input 7 is 247 with predicted probability 0.7285\n", - "The predicted class for input 8 is 89 with predicted probability 0.4504\n", - "The predicted class for input 9 is 311 with predicted probability 0.8283\n" + "The predicted class for input 0 is 9 with predicted probability 0.3137\n", + "The predicted class for input 1 is 281 with predicted probability 0.2777\n", + "The predicted class for input 2 is 189 with predicted probability 0.4749\n", + "The predicted class for input 3 is 158 with predicted probability 0.9535\n", + "The predicted class for input 4 is 82 with predicted probability 0.3277\n", + "The predicted class for input 5 is 247 with predicted probability 
0.299\n", + "The predicted class for input 6 is 209 with predicted probability 0.4056\n", + "The predicted class for input 7 is 247 with predicted probability 0.593\n", + "The predicted class for input 8 is 192 with predicted probability 0.5478\n", + "The predicted class for input 9 is 311 with predicted probability 0.7335\n" ] } ], "source": [ "# Load model.\n", - "load_options = tf.saved_model.LoadOptions(experimental_io_device=\"/job:localhost\")\n", - "saved_model = tf.saved_model.load(os.path.join(trainer.model_dir, \"export\"),\n", - " options=load_options)\n", + "saved_model = tf.saved_model.load(os.path.join(trainer.model_dir, \"export\"))\n", "signature_fn = saved_model.signatures[\n", " tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]\n", "\n", @@ -1104,6 +1123,7 @@ "ScitaPqhKtuW" ], "name": "Solving OGBN-MAG end-to-end with TF-GNN", + "gpuType": "V28", "provenance": [] }, "kernelspec": { diff --git a/examples/notebooks/ogbn_mag_indepth.ipynb b/examples/notebooks/ogbn_mag_indepth.ipynb index 63efbb43..2e1ca8f4 100644 --- a/examples/notebooks/ogbn_mag_indepth.ipynb +++ b/examples/notebooks/ogbn_mag_indepth.ipynb @@ -99,24 +99,24 @@ "base_uri": "https://localhost:8080/" }, "executionInfo": { - "elapsed": 4414, + "elapsed": 6665, "status": "ok", - "timestamp": 1674931789754, + "timestamp": 1711613646349, "user": { - "displayName": "Parth Kothari", - "userId": "14151824495951403621" + "displayName": "Arno Eigenwillig", + "userId": "11315922694496346185" }, "user_tz": -60 }, "id": "6wht6mjUuZeA", - "outputId": "c6b95c89-1f8e-4f1d-9a0d-a070ca658c29" + "outputId": "8ee4cfaa-88f5-4cf6-a1da-00346bbd1857" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Running TF-GNN 0.5.0 under TensorFlow 2.9.2.\n" + "Running TF-GNN 1.0.2 under TensorFlow 2.15.0.\n" ] } ], @@ -242,17 +242,17 @@ "base_uri": "https://localhost:8080/" }, "executionInfo": { - "elapsed": 10, + "elapsed": 5, "status": "ok", - "timestamp": 1670239318853, + "timestamp": 1711613646688, "user": { - "displayName": "Parth Kothari", - "userId": "14151824495951403621" + "displayName": "Arno Eigenwillig", + "userId": "11315922694496346185" }, "user_tz": -60 }, "id": "j3aOVEReYOLO", - "outputId": "0d4af2dc-de5d-4950-f373-d3997f132645" + "outputId": "f2976ee0-89ed-4ff2-8b7b-1f23506b2990" }, "outputs": [ { @@ -260,13 +260,13 @@ "text/plain": [ "context {\n", " features {\n", - " key: \"sample_id\"\n", + " key: \"seed_id\"\n", " value {\n", " dtype: DT_STRING\n", " }\n", " }\n", " features {\n", - " key: \"seed_id\"\n", + " key: \"sample_id\"\n", " value {\n", " dtype: DT_STRING\n", " }\n", @@ -275,9 +275,42 @@ " }\n", "}\n", "node_sets {\n", - " key: \"author\"\n", + " key: \"paper\"\n", " value {\n", " features {\n", + " key: \"year\"\n", + " value {\n", + " dtype: DT_INT64\n", + " shape {\n", + " dim {\n", + " size: 1\n", + " }\n", + " }\n", + " }\n", + " }\n", + " features {\n", + " key: \"labels\"\n", + " value {\n", + " dtype: DT_INT64\n", + " shape {\n", + " dim {\n", + " size: 1\n", + " }\n", + " }\n", + " }\n", + " }\n", + " features {\n", + " key: \"feat\"\n", + " value {\n", + " dtype: DT_FLOAT\n", + " shape {\n", + " dim {\n", + " size: 128\n", + " }\n", + " }\n", + " }\n", + " }\n", + " features {\n", " key: \"#id\"\n", " value {\n", " dtype: DT_STRING\n", @@ -288,7 +321,7 @@ " }\n", "}\n", "node_sets {\n", - " key: \"field_of_study\"\n", + " key: \"institution\"\n", " value {\n", " features {\n", " key: \"#id\"\n", @@ -301,7 +334,7 @@ " }\n", "}\n", "node_sets {\n", - " key: \"institution\"\n", + " 
key: \"field_of_study\"\n", " value {\n", " features {\n", " key: \"#id\"\n", @@ -314,7 +347,7 @@ " }\n", "}\n", "node_sets {\n", - " key: \"paper\"\n", + " key: \"author\"\n", " value {\n", " features {\n", " key: \"#id\"\n", @@ -322,56 +355,27 @@ " dtype: DT_STRING\n", " }\n", " }\n", - " features {\n", - " key: \"feat\"\n", - " value {\n", - " dtype: DT_FLOAT\n", - " shape {\n", - " dim {\n", - " size: 128\n", - " }\n", - " }\n", - " }\n", - " }\n", - " features {\n", - " key: \"labels\"\n", - " value {\n", - " dtype: DT_INT64\n", - " shape {\n", - " dim {\n", - " size: 1\n", - " }\n", - " }\n", - " }\n", - " }\n", - " features {\n", - " key: \"year\"\n", - " value {\n", - " dtype: DT_INT64\n", - " shape {\n", - " dim {\n", - " size: 1\n", - " }\n", - " }\n", - " }\n", - " }\n", " metadata {\n", " }\n", " }\n", "}\n", "edge_sets {\n", - " key: \"affiliated_with\"\n", + " key: \"written\"\n", " value {\n", - " source: \"author\"\n", - " target: \"institution\"\n", + " source: \"paper\"\n", + " target: \"author\"\n", " metadata {\n", + " extra {\n", + " key: \"edge_type\"\n", + " value: \"reversed\"\n", + " }\n", " }\n", " }\n", "}\n", "edge_sets {\n", - " key: \"cites\"\n", + " key: \"writes\"\n", " value {\n", - " source: \"paper\"\n", + " source: \"author\"\n", " target: \"paper\"\n", " metadata {\n", " }\n", @@ -387,24 +391,20 @@ " }\n", "}\n", "edge_sets {\n", - " key: \"writes\"\n", + " key: \"cites\"\n", " value {\n", - " source: \"author\"\n", + " source: \"paper\"\n", " target: \"paper\"\n", " metadata {\n", " }\n", " }\n", "}\n", "edge_sets {\n", - " key: \"written\"\n", + " key: \"affiliated_with\"\n", " value {\n", - " source: \"paper\"\n", - " target: \"author\"\n", + " source: \"author\"\n", + " target: \"institution\"\n", " metadata {\n", - " extra {\n", - " key: \"edge_type\"\n", - " value: \"reversed\"\n", - " }\n", " }\n", " }\n", "}" @@ -502,17 +502,17 @@ "base_uri": "https://localhost:8080/" }, "executionInfo": { - "elapsed": 864, + "elapsed": 775, "status": "ok", - "timestamp": 1670239323809, + "timestamp": 1711613648672, "user": { - "displayName": "Parth Kothari", - "userId": "14151824495951403621" + "displayName": "Arno Eigenwillig", + "userId": "11315922694496346185" }, "user_tz": -60 }, "id": "USZYLqb7J8CI", - "outputId": "83c7a698-3735-4135-966b-1d55371adbe5" + "outputId": "ca58028d-aa85-4be9-c312-f9dffce0a5d5" }, "outputs": [ { @@ -745,17 +745,17 @@ "base_uri": "https://localhost:8080/" }, "executionInfo": { - "elapsed": 5, + "elapsed": 4, "status": "ok", - "timestamp": 1670239324154, + "timestamp": 1711613649035, "user": { - "displayName": "Parth Kothari", - "userId": "14151824495951403621" + "displayName": "Arno Eigenwillig", + "userId": "11315922694496346185" }, "user_tz": -60 }, "id": "7n_4Ui8t0HeR", - "outputId": "ebc79123-7496-4f57-a13e-7c9d3377c4a6" + "outputId": "4d5490a2-7fe1-4a32-97b6-33aa8da89a10" }, "outputs": [ { @@ -829,17 +829,17 @@ "base_uri": "https://localhost:8080/" }, "executionInfo": { - "elapsed": 9295, + "elapsed": 4325, "status": "ok", - "timestamp": 1670239333445, + "timestamp": 1711613653357, "user": { - "displayName": "Parth Kothari", - "userId": "14151824495951403621" + "displayName": "Arno Eigenwillig", + "userId": "11315922694496346185" }, "user_tz": -60 }, "id": "ztkM4Ckxu0Hr", - "outputId": "db581249-45b5-4648-8a34-7293ad504752" + "outputId": "bbd70a96-c3a2-477c-8a0c-9dbbd339dcce" }, "outputs": [ { @@ -878,17 +878,17 @@ "base_uri": "https://localhost:8080/" }, "executionInfo": { - "elapsed": 341, + "elapsed": 370, "status": "ok", 
- "timestamp": 1670239333785, + "timestamp": 1711613653710, "user": { - "displayName": "Parth Kothari", - "userId": "14151824495951403621" + "displayName": "Arno Eigenwillig", + "userId": "11315922694496346185" }, "user_tz": -60 }, "id": "yPgKVJ4Mu0Ez", - "outputId": "ad81ef8d-b100-44bf-f3b7-c5e1a44ab1ad" + "outputId": "e7eee331-6cb0-449f-8150-9d6c0bc04921" }, "outputs": [ { @@ -933,17 +933,17 @@ "base_uri": "https://localhost:8080/" }, "executionInfo": { - "elapsed": 7, + "elapsed": 20, "status": "ok", - "timestamp": 1670239333786, + "timestamp": 1711613653710, "user": { - "displayName": "Parth Kothari", - "userId": "14151824495951403621" + "displayName": "Arno Eigenwillig", + "userId": "11315922694496346185" }, "user_tz": -60 }, "id": "Q5mYSUYIODpR", - "outputId": "cf63d4cf-96ba-4bcb-e4b7-1f6bf69d8284" + "outputId": "a7ec92a7-40e1-4dc2-e43c-eadd4940072b" }, "outputs": [ { @@ -984,7 +984,9 @@ "id": "n7dDGuQOFYqK" }, "source": [ - "We use TensorFlow's [Distribution Strategy](https://www.tensorflow.org/guide/distributed_training) API to write a model that can train in parallel on multiple [Cloud TPUs](https://cloud.google.com/tpu), multiple GPUs, or maybe just locally on CPU. (This is needed on Colab to use Cloud TPUs. This is not required to use the single GPU on a Colab, but we might as well show how it's done for the general case.)" + "We use TensorFlow's [Distribution Strategy](https://www.tensorflow.org/guide/distributed_training) API to write a model that can train in parallel on multiple [Cloud TPUs](https://cloud.google.com/tpu), multiple GPUs, or maybe just locally on CPU. A distribution strategy is not required for a single GPU or CPU, but we might as well show how it's done for the general case.\n", + "\n", + "For CloudTPU, the following code assumes the Colab runtime type \"TPU v2\", that is, a TPU VM. Do not use the runtime type \"TPU (deprecated)\", which uses a TPU Node on a separate VM." 
] }, { @@ -995,49 +997,45 @@ "base_uri": "https://localhost:8080/" }, "executionInfo": { - "elapsed": 6, + "elapsed": 16, "status": "ok", - "timestamp": 1670239333786, + "timestamp": 1711613653711, "user": { - "displayName": "Parth Kothari", - "userId": "14151824495951403621" + "displayName": "Arno Eigenwillig", + "userId": "11315922694496346185" }, "user_tz": -60 }, "id": "5Te_iGkVwYPB", - "outputId": "b50257c8-483c-4774-f3c8-42d1c9f51a1d" + "outputId": "615c099a-f0de-4993-a49a-f7235697c676" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "GPU 0: Tesla T4 (UUID: GPU-fc6155fe-adbf-f710-3e1d-b4f981ab8cde)\n", "Using MirroredStrategy for GPUs\n", + "GPU 0: Tesla T4 (UUID: GPU-17a45941-9927-e24a-29e6-ba2ba8d0505c)\n", "Found 1 replicas in sync\n" ] } ], "source": [ - "try:\n", - " tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver()\n", - " print(\"Running on TPU \", tpu_resolver.cluster_spec().as_dict()[\"worker\"])\n", - "except ValueError:\n", - " tpu_resolver = None\n", - "if tpu_resolver:\n", + "if tf.config.list_physical_devices(\"TPU\"):\n", " print(\"Using TPUStrategy\")\n", + " tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(\"local\")\n", " tf.config.experimental_connect_to_cluster(tpu_resolver)\n", " tf.tpu.experimental.initialize_tpu_system(tpu_resolver)\n", " strategy = tf.distribute.TPUStrategy(tpu_resolver)\n", " assert isinstance(strategy, tf.distribute.TPUStrategy)\n", "elif tf.config.list_physical_devices(\"GPU\"):\n", + " print(f\"Using MirroredStrategy for GPUs\")\n", " gpu_list = !nvidia-smi -L\n", " print(\"\\n\".join(gpu_list))\n", - " print(f\"Using MirroredStrategy for GPUs\")\n", " strategy = tf.distribute.MirroredStrategy()\n", "else:\n", - " strategy = tf.distribute.get_strategy()\n", " print(f\"Using default strategy\")\n", + " strategy = tf.distribute.get_strategy()\n", "print(f\"Found {strategy.num_replicas_in_sync} replicas in sync\")" ] }, @@ -1067,17 +1065,17 @@ "base_uri": "https://localhost:8080/" }, "executionInfo": { - "elapsed": 3, + "elapsed": 260, "status": "ok", - "timestamp": 1670239333786, + "timestamp": 1711613653966, "user": { - "displayName": "Parth Kothari", - "userId": "14151824495951403621" + "displayName": "Arno Eigenwillig", + "userId": "11315922694496346185" }, "user_tz": -60 }, "id": "06AMoGjTJmUg", - "outputId": "a252838e-8b55-4976-b945-bc1a8325842b" + "outputId": "12198e0f-cab0-463b-c375-f7d6aedf0a18" }, "outputs": [ { @@ -1118,24 +1116,24 @@ "base_uri": "https://localhost:8080/" }, "executionInfo": { - "elapsed": 385, + "elapsed": 10, "status": "ok", - "timestamp": 1670239334169, + "timestamp": 1711613653967, "user": { - "displayName": "Parth Kothari", - "userId": "14151824495951403621" + "displayName": "Arno Eigenwillig", + "userId": "11315922694496346185" }, "user_tz": -60 }, "id": "L8Mny61cJmJx", - "outputId": "295c8df8-c1f2-40a3-dc27-121d089b6e84" + "outputId": "2e2edd3a-4634-4fa4-f301-a8d99ad09cfd" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Validation uses a global batch size of 32 (32 per replica).\n" + "Validation uses a global batch size of 128 (128 per replica).\n" ] } ], @@ -1180,17 +1178,17 @@ "base_uri": "https://localhost:8080/" }, "executionInfo": { - "elapsed": 3, + "elapsed": 8, "status": "ok", - "timestamp": 1670239334169, + "timestamp": 1711613653967, "user": { - "displayName": "Parth Kothari", - "userId": "14151824495951403621" + "displayName": "Arno Eigenwillig", + "userId": "11315922694496346185" }, "user_tz": -60 }, "id": 
"ulDdkMmFJmMQ", - "outputId": "91dd20b7-8e12-4848-ad89-a91a36911276" + "outputId": "5d2f5963-d9ab-4c27-a529-c87893c3af0c" }, "outputs": [ { @@ -1335,9 +1333,32 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "Me3cV-Ws1MHJ" + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 9470, + "status": "ok", + "timestamp": 1711613663432, + "user": { + "displayName": "Arno Eigenwillig", + "userId": "11315922694496346185" + }, + "user_tz": -60 + }, + "id": "Me3cV-Ws1MHJ", + "outputId": "54877871-5950-4862-ad16-d30ac376c941" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:Mapping types may not work well with tf.nest. Prefer using MutableMapping for \u003cclass 'tensorflow_gnn.graph.graph_tensor._ImmutableMapping'\u003e\n", + "WARNING:tensorflow:Mapping types may not work well with tf.nest. Prefer using MutableMapping for \u003cclass 'tensorflow_gnn.graph.graph_tensor._ImmutableMapping'\u003e\n" + ] + } + ], "source": [ "example_input_spec = tfgnn.create_graph_spec_from_schema_pb(graph_schema)\n", "\n", @@ -1602,17 +1623,17 @@ "base_uri": "https://localhost:8080/" }, "executionInfo": { - "elapsed": 203225, + "elapsed": 213704, "status": "ok", - "timestamp": 1676544546128, + "timestamp": 1711613886387, "user": { - "displayName": "Parth Kothari", - "userId": "14151824495951403621" + "displayName": "Arno Eigenwillig", + "userId": "11315922694496346185" }, "user_tz": -60 }, "id": "b6x57RqJ2Pi-", - "outputId": "bc087800-7641-4deb-e821-f3bea55b247d" + "outputId": "ba78f7fd-3159-43cb-a393-bd4dc009f7ef" }, "outputs": [ { @@ -1640,15 +1661,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "49/49 [==============================] - 74s 2s/step - loss: 5.2805 - sparse_categorical_accuracy: 0.0611 - sparse_categorical_crossentropy: 5.2522 - val_loss: 5.1825 - val_sparse_categorical_accuracy: 0.0547 - val_sparse_categorical_crossentropy: 5.1551\n", + "49/49 [==============================] - 77s 2s/step - loss: 5.2896 - sparse_categorical_accuracy: 0.0584 - sparse_categorical_crossentropy: 5.2613 - val_loss: 5.2338 - val_sparse_categorical_accuracy: 0.0297 - val_sparse_categorical_crossentropy: 5.2064\n", "Epoch 2/5\n", - "49/49 [==============================] - 35s 706ms/step - loss: 4.9080 - sparse_categorical_accuracy: 0.0834 - sparse_categorical_crossentropy: 4.8812 - val_loss: 4.8607 - val_sparse_categorical_accuracy: 0.0734 - val_sparse_categorical_crossentropy: 4.8343\n", + "49/49 [==============================] - 40s 807ms/step - loss: 4.9051 - sparse_categorical_accuracy: 0.0829 - sparse_categorical_crossentropy: 4.8783 - val_loss: 4.8596 - val_sparse_categorical_accuracy: 0.0719 - val_sparse_categorical_crossentropy: 4.8332\n", "Epoch 3/5\n", - "49/49 [==============================] - 30s 612ms/step - loss: 4.6028 - sparse_categorical_accuracy: 0.1258 - sparse_categorical_crossentropy: 4.5766 - val_loss: 4.5573 - val_sparse_categorical_accuracy: 0.1016 - val_sparse_categorical_crossentropy: 4.5313\n", + "49/49 [==============================] - 32s 662ms/step - loss: 4.6366 - sparse_categorical_accuracy: 0.1089 - sparse_categorical_crossentropy: 4.6104 - val_loss: 4.6487 - val_sparse_categorical_accuracy: 0.1063 - val_sparse_categorical_crossentropy: 4.6227\n", "Epoch 4/5\n", - "49/49 [==============================] - 35s 710ms/step - loss: 4.4082 - sparse_categorical_accuracy: 0.1432 - sparse_categorical_crossentropy: 4.3822 - val_loss: 4.4527 - 
val_sparse_categorical_accuracy: 0.0984 - val_sparse_categorical_crossentropy: 4.4268\n", + "49/49 [==============================] - 31s 635ms/step - loss: 4.5127 - sparse_categorical_accuracy: 0.1189 - sparse_categorical_crossentropy: 4.4867 - val_loss: 4.4764 - val_sparse_categorical_accuracy: 0.0922 - val_sparse_categorical_crossentropy: 4.4505\n", "Epoch 5/5\n", - "49/49 [==============================] - 28s 572ms/step - loss: 4.3681 - sparse_categorical_accuracy: 0.1416 - sparse_categorical_crossentropy: 4.3422 - val_loss: 4.4270 - val_sparse_categorical_accuracy: 0.1000 - val_sparse_categorical_crossentropy: 4.4010\n" + "49/49 [==============================] - 31s 628ms/step - loss: 4.4421 - sparse_categorical_accuracy: 0.1288 - sparse_categorical_crossentropy: 4.4162 - val_loss: 4.5154 - val_sparse_categorical_accuracy: 0.0969 - val_sparse_categorical_crossentropy: 4.4895\n" ] } ], @@ -1721,17 +1742,17 @@ "base_uri": "https://localhost:8080/" }, "executionInfo": { - "elapsed": 27785, + "elapsed": 27862, "status": "ok", - "timestamp": 1670239531162, + "timestamp": 1711613916561, "user": { - "displayName": "Parth Kothari", - "userId": "14151824495951403621" + "displayName": "Arno Eigenwillig", + "userId": "11315922694496346185" }, "user_tz": -60 }, "id": "8sauKPKlL8nb", - "outputId": "cf292f5f-fd23-48d7-c5df-0860aa25f496" + "outputId": "02db1b0a-7a71-4eaa-b044-5e0c042a62c9" }, "outputs": [ { @@ -1746,16 +1767,15 @@ "output_type": "stream", "text": [ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n", - "WARNING:absl:Found untraced functions such as node_set_update_layer_call_fn, node_set_update_layer_call_and_return_conditional_losses, node_set_update_1_layer_call_fn, node_set_update_1_layer_call_and_return_conditional_losses, node_set_update_2_layer_call_fn while saving (showing 5 of 72). 
These functions will not be directly callable after loading.\n", - "/usr/local/lib/python3.8/dist-packages/tensorflow/python/saved_model/nested_structure_coder.py:522: UserWarning: Encoding a StructuredValue with type tensorflow_gnn.GraphTensorSpec; loading this StructuredValue will require that this type be imported and registered.\n", + "/usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/nested_structure_coder.py:458: UserWarning: Encoding a StructuredValue with type tensorflow_gnn.GraphTensorSpec; loading this StructuredValue will require that this type be imported and registered.\n", " warnings.warn(\"Encoding a StructuredValue with type %s; loading this \"\n", - "/usr/local/lib/python3.8/dist-packages/tensorflow/python/saved_model/nested_structure_coder.py:522: UserWarning: Encoding a StructuredValue with type tensorflow_gnn.ContextSpec.v2; loading this StructuredValue will require that this type be imported and registered.\n", + "/usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/nested_structure_coder.py:458: UserWarning: Encoding a StructuredValue with type tensorflow_gnn.ContextSpec.v2; loading this StructuredValue will require that this type be imported and registered.\n", " warnings.warn(\"Encoding a StructuredValue with type %s; loading this \"\n", - "/usr/local/lib/python3.8/dist-packages/tensorflow/python/saved_model/nested_structure_coder.py:522: UserWarning: Encoding a StructuredValue with type tensorflow_gnn.NodeSetSpec; loading this StructuredValue will require that this type be imported and registered.\n", + "/usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/nested_structure_coder.py:458: UserWarning: Encoding a StructuredValue with type tensorflow_gnn.NodeSetSpec; loading this StructuredValue will require that this type be imported and registered.\n", " warnings.warn(\"Encoding a StructuredValue with type %s; loading this \"\n", - "/usr/local/lib/python3.8/dist-packages/tensorflow/python/saved_model/nested_structure_coder.py:522: UserWarning: Encoding a StructuredValue with type tensorflow_gnn.EdgeSetSpec; loading this StructuredValue will require that this type be imported and registered.\n", + "/usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/nested_structure_coder.py:458: UserWarning: Encoding a StructuredValue with type tensorflow_gnn.EdgeSetSpec; loading this StructuredValue will require that this type be imported and registered.\n", " warnings.warn(\"Encoding a StructuredValue with type %s; loading this \"\n", - "/usr/local/lib/python3.8/dist-packages/tensorflow/python/saved_model/nested_structure_coder.py:522: UserWarning: Encoding a StructuredValue with type tensorflow_gnn.AdjacencySpec; loading this StructuredValue will require that this type be imported and registered.\n", + "/usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/nested_structure_coder.py:458: UserWarning: Encoding a StructuredValue with type tensorflow_gnn.AdjacencySpec; loading this StructuredValue will require that this type be imported and registered.\n", " warnings.warn(\"Encoding a StructuredValue with type %s; loading this \"\n" ] } @@ -1763,9 +1783,7 @@ "source": [ "export_path = \"/tmp/exported_keras_model\"\n", "!rm -r {export_path}\n", - "# Save everything on the Colab host (even the variables from TPU memory).\n", - "save_options = tf.saved_model.SaveOptions(experimental_io_device=\"/job:localhost\")\n", - "serving_model.save(export_path, include_optimizer=False, options=save_options)" + 
"serving_model.save(export_path, include_optimizer=False)" ] }, { @@ -1784,9 +1802,7 @@ "id": "XcW_YAF2MQdJ" }, "source": [ - "For demonstration, let's call the exported model on the example dataset from above, but without labels. We load it as a SavedModel, like TF Serving would. (Using `tf.keras.models.load_model()` instead would rebuild the original Keras layers; see TensorFlow's [Save and load models](https://www.tensorflow.org/tutorials/keras/save_and_load) tutorial for more.)\n", - "\n", - "Note: After connecting this Colab to a TPU worker, explicit device placements are necessary to do the test on the colab host (which has the `/tmp` directory). You can omit those when loading the SavedModel elsewhere." + "For demonstration, let's call the exported model on the example dataset from above, but without labels. We load it as a SavedModel, like TF Serving would. (Using `tf.keras.models.load_model()` instead would rebuild the original Keras layers; see TensorFlow's [Save and load models](https://www.tensorflow.org/tutorials/keras/save_and_load) tutorial for more.)" ] }, { @@ -1797,8 +1813,7 @@ }, "outputs": [], "source": [ - "with tf.device(\"/job:localhost\"):\n", - " restored_model = tf.saved_model.load(export_path)" + "restored_model = tf.saved_model.load(export_path)" ] }, { @@ -1809,33 +1824,33 @@ "base_uri": "https://localhost:8080/" }, "executionInfo": { - "elapsed": 977, + "elapsed": 720, "status": "ok", - "timestamp": 1670239541848, + "timestamp": 1711613924501, "user": { - "displayName": "Parth Kothari", - "userId": "14151824495951403621" + "displayName": "Arno Eigenwillig", + "userId": "11315922694496346185" }, "user_tz": -60 }, "id": "DAxsZQjQ9WwR", - "outputId": "0e944b33-4f71-42dd-d3c6-05c5f20de691" + "outputId": "0b6f4b26-3c9b-4b39-c03c-b1ab5ff7953f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The predicted class for input 0 is 258 with predicted probability 0.1439\n", - "The predicted class for input 1 is 1 with predicted probability 0.3566\n", - "The predicted class for input 2 is 1 with predicted probability 0.3649\n", - "The predicted class for input 3 is 134 with predicted probability 0.1926\n", - "The predicted class for input 4 is 258 with predicted probability 0.1489\n", - "The predicted class for input 5 is 1 with predicted probability 0.2027\n", - "The predicted class for input 6 is 134 with predicted probability 0.1512\n", - "The predicted class for input 7 is 134 with predicted probability 0.0338\n", - "The predicted class for input 8 is 258 with predicted probability 0.05497\n", - "The predicted class for input 9 is 134 with predicted probability 0.03303\n" + "The predicted class for input 0 is 258 with predicted probability 0.08846\n", + "The predicted class for input 1 is 1 with predicted probability 0.4106\n", + "The predicted class for input 2 is 1 with predicted probability 0.4112\n", + "The predicted class for input 3 is 134 with predicted probability 0.1854\n", + "The predicted class for input 4 is 258 with predicted probability 0.1949\n", + "The predicted class for input 5 is 1 with predicted probability 0.3494\n", + "The predicted class for input 6 is 134 with predicted probability 0.1438\n", + "The predicted class for input 7 is 112 with predicted probability 0.04063\n", + "The predicted class for input 8 is 112 with predicted probability 0.02867\n", + "The predicted class for input 9 is 283 with predicted probability 0.09065\n" ] } ], @@ -1848,16 +1863,15 @@ "num_examples = 10\n", "clean_examples = 
[_clean_example_for_serving(gt.numpy()) for gt in itertools.islice(demo_ds, num_examples)]\n", "\n", - "with tf.device(\"/job:localhost\"):\n", - " clean_ds = tf.data.Dataset.from_tensor_slices(clean_examples)\n", - " for serialized_example in clean_ds.batch(num_examples).take(1):\n", - " outputs = restored_model.signatures[\"serving_default\"](\n", - " examples=serialized_example)\n", - " probabilities = outputs[\"probabilities\"].numpy()\n", - " classes = probabilities.argmax(axis=1)\n", - " for i, c in enumerate(classes):\n", - " print(f\"The predicted class for input {i} is {c:3} \"\n", - " f\"with predicted probability {probabilities[i, c]:.4}\")" + "clean_ds = tf.data.Dataset.from_tensor_slices(clean_examples)\n", + "for serialized_example in clean_ds.batch(num_examples).take(1):\n", + " outputs = restored_model.signatures[\"serving_default\"](\n", + " examples=serialized_example)\n", + " probabilities = outputs[\"probabilities\"].numpy()\n", + " classes = probabilities.argmax(axis=1)\n", + " for i, c in enumerate(classes):\n", + " print(f\"The predicted class for input {i} is {c:3} \"\n", + " f\"with predicted probability {probabilities[i, c]:.4}\")" ] }, { @@ -1900,14 +1914,22 @@ "collapsed_sections": [ "ScitaPqhKtuW" ], + "gpuType": "T4", "last_runtime": { "build_target": "//research/colab/notebook:notebook_backend_py3", "kind": "private" }, - "name": "An in-depth look at TF-GNN for OGBN-MAG", - "provenance": [] + "provenance": [ + { + "file_id": "1CYTse8C94LiKNw12_VRsqFAY9eUK5cKC", + "timestamp": 1711619616130 + }, + { + "file_id": "https://github.com/tensorflow/gnn/blob/main/examples/notebooks/ogbn_mag_indepth.ipynb", + "timestamp": 1711612125828 + } + ] }, - "gpuClass": "standard", "kernelspec": { "display_name": "Python 3", "name": "python3"
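# Editorial note (not part of the diff): the common thread of this change is replacing the
# TPU-Node detection (TPUClusterResolver with a remote worker address, plus explicit
# /job:localhost device placements for saving/loading) with the TPU-VM pattern used by the
# Colab "TPU v2" runtime. A minimal, standalone sketch of that pattern, assuming TF 2.15 on
# a TPU-VM, GPU, or CPU runtime (this summarizes the code added in the diff; it is not a
# verbatim excerpt of either notebook):
#
#   import tensorflow as tf
#
#   if tf.config.list_physical_devices("TPU"):
#       # On a TPU VM the resolver target is simply "local"; no remote TPU worker
#       # address is involved, so no special SaveOptions/LoadOptions are needed either.
#       resolver = tf.distribute.cluster_resolver.TPUClusterResolver("local")
#       tf.config.experimental_connect_to_cluster(resolver)
#       tf.tpu.experimental.initialize_tpu_system(resolver)
#       strategy = tf.distribute.TPUStrategy(resolver)
#   elif tf.config.list_physical_devices("GPU"):
#       strategy = tf.distribute.MirroredStrategy()
#   else:
#       strategy = tf.distribute.get_strategy()
#
#   print(f"Found {strategy.num_replicas_in_sync} replicas in sync")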