Skip to content
This repository has been archived by the owner on Nov 16, 2023. It is now read-only.

Commit

Permalink
Merge pull request #576 from microsoft/daden/bugfix
Browse files Browse the repository at this point in the history
Daden/bugfix improvement and bug fix based on the bug bash feedback
  • Loading branch information
daden-ms authored Mar 27, 2020
2 parents 806d5fb + 2748b98 commit 2bc3203
Show file tree
Hide file tree
Showing 9 changed files with 147 additions and 304 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@
"metadata": {},
"source": [
"## Before you start\n",
"Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of steps. If QUICK_RUN = False, the notebook takes about 5 hours to run on a VM with 4 16GB NVIDIA V100 GPUs. Finetuning costs around 1.5 hours and inferecing costs around 3.5 hour. Better performance can be achieved by increasing the MAX_STEPS.\n",
"\n",
"It's recommended to run this notebook on GPU machines as it's very computationally intensive. Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of steps. If QUICK_RUN = False, the notebook takes about 5 hours to run on a VM with 4 16GB NVIDIA V100 GPUs. Finetuning costs around 1.5 hours and inferecing costs around 3.5 hour. Better performance can be achieved by increasing the MAX_STEPS.\n",
"\n",
"* **ROUGE Evalation**: To run rouge evaluation, please refer to the section of compute_rouge_perl in [summarization_evaluation.ipynb](./summarization_evaluation.ipynb) for setup.\n",
"\n",
Expand Down Expand Up @@ -92,11 +93,18 @@
"if nlp_path not in sys.path:\n",
" sys.path.insert(0, nlp_path)\n",
"\n",
"from utils_nlp.models.transformers.abstractive_summarization_bertsum import BertSumAbs, BertSumAbsProcessor\n",
"from utils_nlp.models.transformers.abstractive_summarization_bertsum import (\n",
" BertSumAbs,\n",
" BertSumAbsProcessor,\n",
")\n",
"\n",
"from utils_nlp.dataset.cnndm import CNNDMSummarizationDataset\n",
"from utils_nlp.eval import compute_rouge_python\n",
"\n",
"from utils_nlp.models.transformers.datasets import SummarizationDataset\n",
"import nltk\n",
"from nltk import tokenize\n",
"\n",
"import pandas as pd\n",
"import pprint\n",
"import scrapbook as sb"
Expand Down Expand Up @@ -139,8 +147,8 @@
"outputs": [],
"source": [
"train_dataset, test_dataset = CNNDMSummarizationDataset(\n",
" top_n=TOP_N, local_cache_path=DATA_PATH, prepare_extractive=False\n",
" )"
" top_n=TOP_N, local_cache_path=DATA_PATH, prepare_extractive=False\n",
")"
]
},
{
Expand Down Expand Up @@ -190,36 +198,41 @@
"MAX_SOURCE_SEQ_LENGTH = 640\n",
"MAX_TARGET_SEQ_LENGTH = 140\n",
"\n",
"# mixed precision setting. To enable mixed precision training, follow instructions in SETUP.md. \n",
"# mixed precision setting. To enable mixed precision training, follow instructions in SETUP.md.\n",
"FP16 = False\n",
"if FP16:\n",
" FP16_OPT_LEVEL=\"O2\"\n",
" \n",
" FP16_OPT_LEVEL = \"O2\"\n",
"\n",
"# fine-tuning parameters\n",
"# batch size, unit is the number of tokens\n",
"BATCH_SIZE_PER_GPU = 3\n",
"BATCH_SIZE_PER_GPU = 1\n",
"\n",
"\n",
"# GPU used for training\n",
"NUM_GPUS = torch.cuda.device_count()\n",
"if NUM_GPUS > 0:\n",
" BATCH_SIZE = NUM_GPUS * BATCH_SIZE_PER_GPU\n",
"else:\n",
" BATCH_SIZE = 1\n",
"\n",
"\n",
"# Learning rate\n",
"LEARNING_RATE_BERT=5e-4/2.0\n",
"LEARNING_RATE_DEC=0.05/2.0\n",
"LEARNING_RATE_BERT = 5e-4 / 2.0\n",
"LEARNING_RATE_DEC = 0.05 / 2.0\n",
"\n",
"\n",
"# How often the statistics reports show up in training, unit is step.\n",
"REPORT_EVERY=10\n",
"SAVE_EVERY=500\n",
"REPORT_EVERY = 10\n",
"SAVE_EVERY = 500\n",
"\n",
"# total number of steps for training\n",
"MAX_STEPS=1e3\n",
" \n",
"MAX_STEPS = 1e3\n",
"\n",
"if not QUICK_RUN:\n",
" MAX_STEPS=5e3\n",
" MAX_STEPS = 5e3\n",
"\n",
"WARMUP_STEPS_BERT=2000\n",
"WARMUP_STEPS_DEC=1000 \n"
"WARMUP_STEPS_BERT = 2000\n",
"WARMUP_STEPS_DEC = 1000"
]
},
{
Expand Down Expand Up @@ -253,21 +266,20 @@
},
"outputs": [],
"source": [
"\n",
"summarizer.fit(\n",
" train_dataset,\n",
" num_gpus=NUM_GPUS,\n",
" batch_size=BATCH_SIZE_PER_GPU*NUM_GPUS,\n",
" max_steps=MAX_STEPS,\n",
" learning_rate_bert=LEARNING_RATE_BERT,\n",
" learning_rate_dec=LEARNING_RATE_DEC,\n",
" warmup_steps_bert=WARMUP_STEPS_BERT,\n",
" warmup_steps_dec=WARMUP_STEPS_DEC,\n",
" save_every=SAVE_EVERY,\n",
" report_every=REPORT_EVERY*5,\n",
" fp16=FP16,\n",
" # checkpoint=\"saved checkpoint path\"\n",
")\n"
" train_dataset,\n",
" num_gpus=NUM_GPUS,\n",
" batch_size=BATCH_SIZE,\n",
" max_steps=MAX_STEPS,\n",
" learning_rate_bert=LEARNING_RATE_BERT,\n",
" learning_rate_dec=LEARNING_RATE_DEC,\n",
" warmup_steps_bert=WARMUP_STEPS_BERT,\n",
" warmup_steps_dec=WARMUP_STEPS_DEC,\n",
" save_every=SAVE_EVERY,\n",
" report_every=REPORT_EVERY * 5,\n",
" fp16=FP16,\n",
" # checkpoint=\"saved checkpoint path\"\n",
")"
]
},
{
Expand Down Expand Up @@ -327,14 +339,19 @@
"TEST_TOP_N = 32\n",
"if not QUICK_RUN:\n",
" TEST_TOP_N = len(test_dataset)\n",
"\n",
"if NUM_GPUS:\n",
" BATCH_SIZE = NUM_GPUS * BATCH_SIZE_PER_GPU\n",
"else:\n",
" BATCH_SIZE = 1\n",
" \n",
"shortened_dataset= test_dataset.shorten(top_n=TEST_TOP_N)\n",
"shortened_dataset = test_dataset.shorten(top_n=TEST_TOP_N)\n",
"src = shortened_dataset.get_source()\n",
"reference_summaries = [\" \".join(t).rstrip(\"\\n\") for t in shortened_dataset.get_target()]\n",
"generated_summaries = summarizer.predict(\n",
" shortened_dataset, batch_size=32*4, num_gpus=NUM_GPUS\n",
" shortened_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS\n",
")\n",
"assert len(generated_summaries) == len(reference_summaries)\n"
"assert len(generated_summaries) == len(reference_summaries)"
]
},
{
Expand Down Expand Up @@ -374,13 +391,6 @@
"pprint.pprint(rouge_scores)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -415,39 +425,22 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from utils_nlp.models.transformers.datasets import SummarizationDataset\n",
"import nltk\n",
"from nltk import tokenize\n",
"\n",
"test_dataset = SummarizationDataset(\n",
" None,\n",
" source=[source],\n",
" source_preprocessing=[tokenize.sent_tokenize],\n",
" None, source=[source], source_preprocessing=[tokenize.sent_tokenize],\n",
")\n",
"generated_summaries = summarizer.predict(\n",
" test_dataset, batch_size=1, num_gpus=1\n",
")\n"
"generated_summaries = summarizer.predict(test_dataset, batch_size=1, num_gpus=NUM_GPUS)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'two employees bought , sold weapons on their own , company says . company fired workers , turned them in to atf , says it was identified in the feds are sold weapons , entirely genuine \" u . s . officials say they turned them two miles east - northeast of oakland , while donors are paid just $ 300 to $ 1 , 000 .'"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"generated_summaries[0]"
]
Expand Down Expand Up @@ -475,9 +468,9 @@
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "python3.6 cm3",
"display_name": "Python (nlp_gpu)",
"language": "python",
"name": "cm3"
"name": "nlp_gpu"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -494,4 +487,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
"import time\n",
"\n",
"from utils_nlp.dataset.cnndm import CNNDMSummarizationDatasetOrg\n",
"from utils_nlp.models import S2SAbsSumProcessor, S2SAbstractiveSummarizer\n",
"from utils_nlp.models.transformers.abstractive_summarization_seq2seq import S2SAbsSumProcessor, S2SAbstractiveSummarizer\n",
"from utils_nlp.eval import compute_rouge_python\n",
"\n",
"start_time = time.time()"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@

import torch

from utils_nlp.models import S2SAbsSumProcessor, S2SAbstractiveSummarizer
from utils_nlp.models.transformers.abstractive_summarization_seq2seq import (
S2SAbsSumProcessor,
S2SAbstractiveSummarizer
)

from utils_nlp.eval import compute_rouge_python

parser = argparse.ArgumentParser()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
"- Azure Machine Learning Workspace\n",
"- Azure Machine Learning SDK\n",
"\n",
"To run rouge evaluation, please refer to the section of compute_rouge_perl in [summarization_evaluation.ipynb](summarization_evaluation.ipynb). "
"To run rouge evaluation, please refer to the section of compute_rouge_perl in [summarization_evaluation.ipynb](summarization_evaluation.ipynb). \n",
"\n",
"You can run this notebook on CPU-only machines."
]
},
{
Expand Down Expand Up @@ -84,7 +86,9 @@
" ExtSumProcessor,\n",
")\n",
"# Check core SDK version number\n",
"print(\"SDK version:\", azureml.core.VERSION)"
"print(\"SDK version:\", azureml.core.VERSION)\n",
"\n",
"import pprint"
]
},
{
Expand All @@ -106,7 +110,6 @@
"RESOURCE_GROUP = \"YOUR_WORKSPACE_NAME\" # modifiy to use your own\n",
"WORKSPACE_NAME = \"YOUR_WORKSPACE_REGION\" # modifiy to use your own\n",
"\n",
"\n",
"# for creating Azure ML Compute Cluster\n",
"AMLCOMPUTE_CLUSTER_NAME = \"bertsumext\" # modifiy to use your own\n",
"NODE_COUNT = 2\n",
Expand Down Expand Up @@ -152,7 +155,7 @@
"\n",
"##\n",
"# The number of lines at the head of data file used for preprocessing. -1 means all the lines.\n",
"TOP_N = 1000\n",
"TOP_N = 100\n",
"QUICK_RUN = True\n",
"if not QUICK_RUN:\n",
" TOP_N = -1"
Expand Down Expand Up @@ -293,11 +296,11 @@
"outputs": [],
"source": [
"ENTRY_SCRIPT = \"extractive_summarization_cnndm_distributed_train.py\"\n",
"!mkdir -p {PROJECT_FOLDER}\n",
"!python ../../tools/generate_conda_file.py --gpu --name {CONDA_ENV_NAME}\n",
"!cp ./nlp_gpu.yaml {PROJECT_FOLDER}\n",
"!cp {ENTRY_SCRIPT} {PROJECT_FOLDER}\n",
"!cp -r ../../utils_nlp {PROJECT_FOLDER}"
"os.makedirs(PROJECT_FOLDER, exist_ok=True)\n",
"os.system(\"python ../../tools/generate_conda_file.py --gpu --name {}\".format(CONDA_ENV_NAME))\n",
"os.system(\"cp ./nlp_gpu.yaml {}\".format(PROJECT_FOLDER))\n",
"os.system(\"cp {} {}\".format(ENTRY_SCRIPT, PROJECT_FOLDER))\n",
"os.system(\"cp -r ../../utils_nlp {}\".format(PROJECT_FOLDER))"
]
},
{
Expand Down Expand Up @@ -397,8 +400,8 @@
"metadata": {},
"outputs": [],
"source": [
"# need to clear the local output dir as the ds.download won't download if the path exists\n",
"!rm -rf {LOCAL_OUTPUT_DIR}/* "
"# need to clear the local output dir as the ds.download won't download if the path exists \n",
"os.system(\"rm -rf {}/*\".format(LOCAL_OUTPUT_DIR))"
]
},
{
Expand All @@ -418,10 +421,13 @@
"metadata": {},
"outputs": [],
"source": [
"# the script uses <q> as sentence separator so it can write the prediction into the files properly\n",
"# here we need to replace <q> with \"\\n\" to prepare for evalation\n",
"# removing the ending \"\\n\" is also a preparation step for evalution.\n",
"prediction = []\n",
"with open(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{SUMMARY_FILENAME}'), \"r\") as filehandle:\n",
" for cnt, line in enumerate(filehandle):\n",
" prediction.append(line[0:-1]) # remove the ending \"\\n\""
" prediction.append(line[0:-1].replace(\"<q>\", \"\\n\")) # remove the ending \"\\n\""
]
},
{
Expand Down Expand Up @@ -451,7 +457,7 @@
"for i in ext_sum_test:\n",
" source.append(i[\"src_txt\"]) \n",
" temp_target.append(\" \".join(j) for j in i['tgt']) \n",
"target = [''.join(i) for i in list(temp_target)]"
"target = ['\\n'.join(i) for i in list(temp_target)]"
]
},
{
Expand Down Expand Up @@ -498,13 +504,13 @@
"metadata": {},
"outputs": [],
"source": [
"# processor = ExtSumProcessor()\n",
"BATCH_SIZE = 32\n",
"summarizer = ExtractiveSummarizer(processor, encoder=ENCODER, cache_dir=LOCAL_CACHE_DIR)\n",
"summarizer.model.load_state_dict(\n",
" torch.load(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}'),\n",
" map_location=\"cpu\"))\n",
"\n",
"prediction = summarizer.predict(test_dataset[0:TOP_N], num_gpus=torch.cuda.device_count(), batch_size=128, sentence_separator = \"\\n\")\n",
"prediction = summarizer.predict(ext_sum_test, num_gpus=torch.cuda.device_count(), batch_size=BATCH_SIZE, sentence_separator = \"\\n\")\n",
"#\"\"\""
]
},
Expand Down
Loading

0 comments on commit 2bc3203

Please sign in to comment.