Merge pull request #576 from microsoft/daden/bugfix

Daden/bugfix improvement and bug fix based on the bug bash feedback
microsoft · Mar 27, 2020 · 2bc3203 · 2bc3203
2 parents 806d5fb + 2748b98
commit 2bc3203
Show file tree

Hide file tree

Showing 9 changed files with 147 additions and 304 deletions.
diff --git a/examples/text_summarization/abstractive_summarization_bertsumabs_cnndm.ipynb b/examples/text_summarization/abstractive_summarization_bertsumabs_cnndm.ipynb
@@ -46,7 +46,8 @@
    "metadata": {},
    "source": [
     "## Before you start\n",
-    "Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of steps. If QUICK_RUN = False, the notebook takes about 5 hours to run on a VM with 4 16GB NVIDIA V100 GPUs. Finetuning costs around 1.5 hours and inferecing costs around 3.5 hour.  Better performance can be achieved by increasing the MAX_STEPS.\n",
+    "\n",
+    "It's recommended to run this notebook on GPU machines as it's very computationally intensive. Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of steps. If QUICK_RUN = False, the notebook takes about 5 hours to run on a VM with 4 16GB NVIDIA V100 GPUs. Fine-tuning takes around 1.5 hours and inference takes around 3.5 hours. Better performance can be achieved by increasing MAX_STEPS.\n",
     "\n",
     "* **ROUGE Evalation**: To run rouge evaluation, please refer to the section of compute_rouge_perl in [summarization_evaluation.ipynb](./summarization_evaluation.ipynb) for setup.\n",
     "\n",
@@ -92,11 +93,18 @@
     "if nlp_path not in sys.path:\n",
     "    sys.path.insert(0, nlp_path)\n",
     "\n",
-    "from utils_nlp.models.transformers.abstractive_summarization_bertsum import BertSumAbs, BertSumAbsProcessor\n",
+    "from utils_nlp.models.transformers.abstractive_summarization_bertsum import (\n",
+    "    BertSumAbs,\n",
+    "    BertSumAbsProcessor,\n",
+    ")\n",
     "\n",
     "from utils_nlp.dataset.cnndm import CNNDMSummarizationDataset\n",
     "from utils_nlp.eval import compute_rouge_python\n",
     "\n",
+    "from utils_nlp.models.transformers.datasets import SummarizationDataset\n",
+    "import nltk\n",
+    "from nltk import tokenize\n",
+    "\n",
     "import pandas as pd\n",
     "import pprint\n",
     "import scrapbook as sb"
@@ -139,8 +147,8 @@
    "outputs": [],
    "source": [
     "train_dataset, test_dataset = CNNDMSummarizationDataset(\n",
-    "            top_n=TOP_N, local_cache_path=DATA_PATH, prepare_extractive=False\n",
-    "        )"
+    "    top_n=TOP_N, local_cache_path=DATA_PATH, prepare_extractive=False\n",
+    ")"
    ]
   },
   {
@@ -190,36 +198,41 @@
     "MAX_SOURCE_SEQ_LENGTH = 640\n",
     "MAX_TARGET_SEQ_LENGTH = 140\n",
     "\n",
-    "# mixed precision setting. To enable mixed precision training, follow instructions in SETUP.md. \n",
+    "# mixed precision setting. To enable mixed precision training, follow instructions in SETUP.md.\n",
     "FP16 = False\n",
     "if FP16:\n",
-    "    FP16_OPT_LEVEL=\"O2\"\n",
-    "    \n",
+    "    FP16_OPT_LEVEL = \"O2\"\n",
+    "\n",
     "# fine-tuning parameters\n",
     "# batch size, unit is the number of tokens\n",
-    "BATCH_SIZE_PER_GPU = 3\n",
+    "BATCH_SIZE_PER_GPU = 1\n",
     "\n",
     "\n",
     "# GPU used for training\n",
     "NUM_GPUS = torch.cuda.device_count()\n",
+    "if NUM_GPUS > 0:\n",
+    "    BATCH_SIZE = NUM_GPUS * BATCH_SIZE_PER_GPU\n",
+    "else:\n",
+    "    BATCH_SIZE = 1\n",
+    "\n",
     "\n",
     "# Learning rate\n",
-    "LEARNING_RATE_BERT=5e-4/2.0\n",
-    "LEARNING_RATE_DEC=0.05/2.0\n",
+    "LEARNING_RATE_BERT = 5e-4 / 2.0\n",
+    "LEARNING_RATE_DEC = 0.05 / 2.0\n",
     "\n",
     "\n",
     "# How often the statistics reports show up in training, unit is step.\n",
-    "REPORT_EVERY=10\n",
-    "SAVE_EVERY=500\n",
+    "REPORT_EVERY = 10\n",
+    "SAVE_EVERY = 500\n",
     "\n",
     "# total number of steps for training\n",
-    "MAX_STEPS=1e3\n",
-    "   \n",
+    "MAX_STEPS = 1e3\n",
+    "\n",
     "if not QUICK_RUN:\n",
-    "    MAX_STEPS=5e3\n",
+    "    MAX_STEPS = 5e3\n",
     "\n",
-    "WARMUP_STEPS_BERT=2000\n",
-    "WARMUP_STEPS_DEC=1000   \n"
+    "WARMUP_STEPS_BERT = 2000\n",
+    "WARMUP_STEPS_DEC = 1000"
    ]
   },
   {
@@ -253,21 +266,20 @@
    },
    "outputs": [],
    "source": [
-    "\n",
     "summarizer.fit(\n",
-    "        train_dataset,\n",
-    "        num_gpus=NUM_GPUS,\n",
-    "        batch_size=BATCH_SIZE_PER_GPU*NUM_GPUS,\n",
-    "        max_steps=MAX_STEPS,\n",
-    "        learning_rate_bert=LEARNING_RATE_BERT,\n",
-    "        learning_rate_dec=LEARNING_RATE_DEC,\n",
-    "        warmup_steps_bert=WARMUP_STEPS_BERT,\n",
-    "        warmup_steps_dec=WARMUP_STEPS_DEC,\n",
-    "        save_every=SAVE_EVERY,\n",
-    "        report_every=REPORT_EVERY*5,\n",
-    "        fp16=FP16,\n",
-    "        # checkpoint=\"saved checkpoint path\"\n",
-    ")\n"
+    "    train_dataset,\n",
+    "    num_gpus=NUM_GPUS,\n",
+    "    batch_size=BATCH_SIZE,\n",
+    "    max_steps=MAX_STEPS,\n",
+    "    learning_rate_bert=LEARNING_RATE_BERT,\n",
+    "    learning_rate_dec=LEARNING_RATE_DEC,\n",
+    "    warmup_steps_bert=WARMUP_STEPS_BERT,\n",
+    "    warmup_steps_dec=WARMUP_STEPS_DEC,\n",
+    "    save_every=SAVE_EVERY,\n",
+    "    report_every=REPORT_EVERY * 5,\n",
+    "    fp16=FP16,\n",
+    "    # checkpoint=\"saved checkpoint path\"\n",
+    ")"
    ]
   },
   {
@@ -327,14 +339,19 @@
     "TEST_TOP_N = 32\n",
     "if not QUICK_RUN:\n",
     "    TEST_TOP_N = len(test_dataset)\n",
+    "\n",
+    "if NUM_GPUS:\n",
+    "    BATCH_SIZE = NUM_GPUS * BATCH_SIZE_PER_GPU\n",
+    "else:\n",
+    "    BATCH_SIZE = 1\n",
     "    \n",
-    "shortened_dataset= test_dataset.shorten(top_n=TEST_TOP_N)\n",
+    "shortened_dataset = test_dataset.shorten(top_n=TEST_TOP_N)\n",
     "src = shortened_dataset.get_source()\n",
     "reference_summaries = [\" \".join(t).rstrip(\"\\n\") for t in shortened_dataset.get_target()]\n",
     "generated_summaries = summarizer.predict(\n",
-    "    shortened_dataset, batch_size=32*4, num_gpus=NUM_GPUS\n",
+    "    shortened_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS\n",
     ")\n",
-    "assert len(generated_summaries) == len(reference_summaries)\n"
+    "assert len(generated_summaries) == len(reference_summaries)"
    ]
   },
   {
@@ -374,13 +391,6 @@
     "pprint.pprint(rouge_scores)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -415,39 +425,22 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
-    "from utils_nlp.models.transformers.datasets import SummarizationDataset\n",
-    "import nltk\n",
-    "from nltk import tokenize\n",
-    "\n",
     "test_dataset = SummarizationDataset(\n",
-    "    None,\n",
-    "    source=[source],\n",
-    "    source_preprocessing=[tokenize.sent_tokenize],\n",
+    "    None, source=[source], source_preprocessing=[tokenize.sent_tokenize],\n",
     ")\n",
-    "generated_summaries = summarizer.predict(\n",
-    "    test_dataset, batch_size=1, num_gpus=1\n",
-    ")\n"
+    "generated_summaries = summarizer.predict(test_dataset, batch_size=1, num_gpus=NUM_GPUS)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'two employees bought , sold weapons on their own , company says .   company fired workers , turned them in to atf , says it was identified in the feds are sold weapons , entirely genuine \"     u . s . officials say they turned them two miles east - northeast of oakland , while donors are paid just $ 300 to $ 1 , 000 .'"
-      ]
-     },
-     "execution_count": 26,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "generated_summaries[0]"
    ]
@@ -475,9 +468,9 @@
  "metadata": {
   "celltoolbar": "Tags",
   "kernelspec": {
-   "display_name": "python3.6 cm3",
+   "display_name": "Python (nlp_gpu)",
    "language": "python",
-   "name": "cm3"
+   "name": "nlp_gpu"
   },
   "language_info": {
    "codemirror_mode": {
@@ -494,4 +487,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
diff --git a/examples/text_summarization/abstractive_summarization_unilm_cnndm.ipynb b/examples/text_summarization/abstractive_summarization_unilm_cnndm.ipynb
@@ -65,7 +65,7 @@
     "import time\n",
     "\n",
     "from utils_nlp.dataset.cnndm import CNNDMSummarizationDatasetOrg\n",
-    "from utils_nlp.models import S2SAbsSumProcessor, S2SAbstractiveSummarizer\n",
+    "from utils_nlp.models.transformers.abstractive_summarization_seq2seq import S2SAbsSumProcessor, S2SAbstractiveSummarizer\n",
     "from utils_nlp.eval import compute_rouge_python\n",
     "\n",
     "start_time = time.time()"

diff --git a/examples/text_summarization/abstractive_summarization_unilm_cnndm.py b/examples/text_summarization/abstractive_summarization_unilm_cnndm.py
@@ -4,7 +4,11 @@
 
 import torch
 
-from utils_nlp.models import S2SAbsSumProcessor, S2SAbstractiveSummarizer
+from utils_nlp.models.transformers.abstractive_summarization_seq2seq import (
+     S2SAbsSumProcessor, 
+     S2SAbstractiveSummarizer
+)
+
 from utils_nlp.eval import compute_rouge_python
 
 parser = argparse.ArgumentParser()

diff --git a/examples/text_summarization/extractive_summarization_cnndm_aml_distributed.ipynb b/examples/text_summarization/extractive_summarization_cnndm_aml_distributed.ipynb
@@ -25,7 +25,9 @@
     "- Azure Machine Learning Workspace\n",
     "- Azure Machine Learning SDK\n",
     "\n",
-    "To run rouge evaluation, please refer to the section of compute_rouge_perl in [summarization_evaluation.ipynb](summarization_evaluation.ipynb). "
+    "To run rouge evaluation, please refer to the section of compute_rouge_perl in [summarization_evaluation.ipynb](summarization_evaluation.ipynb). \n",
+    "\n",
+    "You can run this notebook on CPU-only machines."
    ]
   },
   {
@@ -84,7 +86,9 @@
     "    ExtSumProcessor,\n",
     ")\n",
     "# Check core SDK version number\n",
-    "print(\"SDK version:\", azureml.core.VERSION)"
+    "print(\"SDK version:\", azureml.core.VERSION)\n",
+    "\n",
+    "import pprint"
    ]
   },
   {
@@ -106,7 +110,6 @@
     "RESOURCE_GROUP = \"YOUR_WORKSPACE_NAME\"  # modifiy to use your own\n",
     "WORKSPACE_NAME = \"YOUR_WORKSPACE_REGION\"  # modifiy to use your own\n",
     "\n",
-    "\n",
     "# for creating Azure ML Compute Cluster\n",
     "AMLCOMPUTE_CLUSTER_NAME = \"bertsumext\"  # modifiy to use your own\n",
     "NODE_COUNT = 2\n",
@@ -152,7 +155,7 @@
     "\n",
     "##\n",
     "# The number of lines at the head of data file used for preprocessing. -1 means all the lines.\n",
-    "TOP_N = 1000\n",
+    "TOP_N = 100\n",
     "QUICK_RUN = True\n",
     "if not QUICK_RUN:\n",
     "    TOP_N = -1"
@@ -293,11 +296,11 @@
    "outputs": [],
    "source": [
     "ENTRY_SCRIPT = \"extractive_summarization_cnndm_distributed_train.py\"\n",
-    "!mkdir -p {PROJECT_FOLDER}\n",
-    "!python ../../tools/generate_conda_file.py --gpu --name {CONDA_ENV_NAME}\n",
-    "!cp ./nlp_gpu.yaml {PROJECT_FOLDER}\n",
-    "!cp {ENTRY_SCRIPT} {PROJECT_FOLDER}\n",
-    "!cp -r ../../utils_nlp {PROJECT_FOLDER}"
+    "os.makedirs(PROJECT_FOLDER, exist_ok=True)\n",
+    "os.system(\"python ../../tools/generate_conda_file.py --gpu --name {}\".format(CONDA_ENV_NAME))\n",
+    "os.system(\"cp ./nlp_gpu.yaml {}\".format(PROJECT_FOLDER))\n",
+    "os.system(\"cp {} {}\".format(ENTRY_SCRIPT, PROJECT_FOLDER))\n",
+    "os.system(\"cp -r ../../utils_nlp {}\".format(PROJECT_FOLDER))"
    ]
   },
   {
@@ -397,8 +400,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# need to clear the local output dir as the ds.download won't download if the path exists\n",
-    "!rm -rf {LOCAL_OUTPUT_DIR}/* "
+    "# need to clear the local output dir as the ds.download won't download if the path exists \n",
+    "os.system(\"rm -rf  {}/*\".format(LOCAL_OUTPUT_DIR))"
    ]
   },
   {
@@ -418,10 +421,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# the script uses <q> as sentence separator so it can write the prediction into the files properly\n",
+    "# here we need to replace <q> with \"\\n\" to prepare for evaluation\n",
+    "# removing the ending \"\\n\" is also a preparation step for evaluation.\n",
     "prediction = []\n",
     "with open(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{SUMMARY_FILENAME}'), \"r\") as filehandle:\n",
     "    for cnt, line in enumerate(filehandle):\n",
-    "        prediction.append(line[0:-1]) # remove the ending \"\\n\""
+    "        prediction.append(line[0:-1].replace(\"<q>\", \"\\n\")) # remove the ending \"\\n\""
    ]
   },
   {
@@ -451,7 +457,7 @@
     "for i in ext_sum_test:\n",
     "    source.append(i[\"src_txt\"]) \n",
     "    temp_target.append(\" \".join(j) for j in i['tgt']) \n",
-    "target = [''.join(i) for i in list(temp_target)]"
+    "target = ['\\n'.join(i) for i in list(temp_target)]"
    ]
   },
   {
@@ -498,13 +504,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# processor = ExtSumProcessor()\n",
+    "BATCH_SIZE = 32\n",
     "summarizer = ExtractiveSummarizer(processor, encoder=ENCODER, cache_dir=LOCAL_CACHE_DIR)\n",
     "summarizer.model.load_state_dict(\n",
     "    torch.load(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}'),\n",
     "               map_location=\"cpu\"))\n",
     "\n",
-    "prediction = summarizer.predict(test_dataset[0:TOP_N], num_gpus=torch.cuda.device_count(), batch_size=128, sentence_separator = \"\\n\")\n",
+    "prediction = summarizer.predict(ext_sum_test, num_gpus=torch.cuda.device_count(), batch_size=BATCH_SIZE, sentence_separator = \"\\n\")\n",
     "#\"\"\""
    ]
   },