From ffaf3ffb5e4fc23665d08fa3f116466cabf966b1 Mon Sep 17 00:00:00 2001 From: Enrico Usai <usai@amazon.com> Date: Thu, 8 Nov 2018 14:28:12 +0100 Subject: [PATCH 1/8] Rename CfnCluster to AWS ParallelCluster + change package name from cfncluster-node to aws-parallelcluster-node + change folder from /opt/cfncluster to /opt/parallelcluster + change event messages from cfncluster to parallelcluster + change github repos from awslabs/cfncluster-* to aws/aws-parallelcluster-* Signed-off-by: Enrico Usai <usai@amazon.com> --- CHANGELOG.md | 6 +++--- CONTRIBUTING.md | 6 +++--- NOTICE.txt | 4 ++-- README.rst | 12 ++++++------ jobwatcher/jobwatcher.py | 9 +++++---- jobwatcher/plugins/test.py | 2 +- setup.py | 28 ++++++++++++++-------------- sqswatcher/plugins/sge.py | 2 +- sqswatcher/sqswatcher.py | 2 +- 9 files changed, 36 insertions(+), 35 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b1ce2a14f..245a9010e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ -cfncluster-node CHANGELOG -========================= +aws-parallelcluster-node CHANGELOG +=================================== -This file is used to list changes made in each version of the cfncluster-node package. +This file is used to list changes made in each version of the aws-parallelcluster-node package. 1.6.0 ----- diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 10d30495d..e74c56974 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,7 +11,7 @@ information to effectively respond to your bug report or contribution. We welcome you to use the GitHub issue tracker to report bugs or suggest features. 
-When filing an issue, please check [existing open](https://github.com/awslabs/cfncluster-node/issues), or [recently closed](https://github.com/awslabs/cfncluster-node/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already +When filing an issue, please check [existing open](https://github.com/aws/aws-parallelcluster-node/issues), or [recently closed](https://github.com/aws/aws-parallelcluster-node/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: * A reproducible test case or series of steps @@ -41,7 +41,7 @@ GitHub provides additional document on [forking a repository](https://help.githu ## Finding contributions to work on -Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels ((enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/awslabs/cfncluster-node/labels/help%20wanted) issues is a great place to start. +Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels ((enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws/aws-parallelcluster-node/labels/help%20wanted) issues is a great place to start. ## Code of Conduct @@ -56,6 +56,6 @@ If you discover a potential security issue in this project we ask that you notif ## Licensing -See the [LICENSE](https://github.com/awslabs/cfncluster-node/blob/develop/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. +See the [LICENSE](https://github.com/aws/aws-parallelcluster-node/blob/develop/LICENSE) file for our project's licensing. 
We will ask you to confirm the licensing of your contribution. We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. diff --git a/NOTICE.txt b/NOTICE.txt index efe2fbd95..e2e757506 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -1,2 +1,2 @@ -cfncluster-node -Copyright 2014-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved. +aws-parallelcluster-node +Copyright 2014-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/README.rst b/README.rst index db11e3f0a..fed9daa42 100644 --- a/README.rst +++ b/README.rst @@ -1,9 +1,9 @@ -=============== -cfncluster-node -=============== +========================= +aws-parallelcluster-node +========================= -.. image:: https://travis-ci.org/awslabs/cfncluster-node.png?branch=develop - :target: https://travis-ci.org/awslabs/cfncluster-node +.. image:: https://travis-ci.org/aws/aws-parallelcluster-node.png?branch=develop + :target: https://travis-ci.org/aws/aws-parallelcluster-node :alt: Build Status -cfncluster-node is the python package installed on the Amazon EC2 instances launched as part of CfnCluster. + aws-parallelcluster-node is the python package installed on the Amazon EC2 instances launched as part of AWS ParallelCluster. diff --git a/jobwatcher/jobwatcher.py b/jobwatcher/jobwatcher.py index 859213b3b..9ffd92798 100644 --- a/jobwatcher/jobwatcher.py +++ b/jobwatcher/jobwatcher.py @@ -24,8 +24,8 @@ from botocore.config import Config log = logging.getLogger(__name__) -pricing_file = '/opt/cfncluster/instances.json' -cfnconfig_file = '/opt/cfncluster/cfnconfig' +pricing_file = '/opt/parallelcluster/instances.json' +cfnconfig_file = '/opt/parallelcluster/cfnconfig' def load_scheduler_module(scheduler): @@ -112,12 +112,13 @@ def fetch_pricing_file(proxy_config, cfncluster_dir, region): except OSError as ex: log.critical('Could not create directory %s. 
Failed with exception: %s' % (cfncluster_dir, ex)) raise - bucket_name = '%s-cfncluster' % region + bucket_name = '%s-aws-parallelcluster' % region try: bucket = s3.Bucket(bucket_name) bucket.download_file('instances/instances.json', '%s/instances.json' % cfncluster_dir) except ClientError as e: - log.critical("Could not save instance mapping file %s/instances.json from S3 bucket %s. Failed with exception: %s" % (cfncluster_dir, bucket_name, e)) + log.critical("Could not save instance mapping file %s/instances.json from S3 bucket %s. " + "Failed with exception: %s" % (cfncluster_dir, bucket_name, e)) raise diff --git a/jobwatcher/plugins/test.py b/jobwatcher/plugins/test.py index 92235b9de..b0034925e 100644 --- a/jobwatcher/plugins/test.py +++ b/jobwatcher/plugins/test.py @@ -31,7 +31,7 @@ def get_busy_nodes(instance_properties): def nodes(slots, instance_properties): if slots <= 0: return 0 - with open('/opt/cfncluster/instances.json') as f: + with open('/opt/parallelcluster/instances.json') as f: instances = json.load(f) vcpus = int(instances[instance_type]["vcpus"]) log.info("Instance %s has %s slots." 
% (instance_type, vcpus)) diff --git a/setup.py b/setup.py index d2dd8945d..036525feb 100644 --- a/setup.py +++ b/setup.py @@ -38,22 +38,22 @@ def read(fname): requires.append('paramiko>=2.4.2') setup( - name = "cfncluster-node", - version = version, - author = "Dougal Ballantyne", - author_email = "dougalb@amazon.com", - description = ("cfncluster-node provides the scripts for a cfncluster node."), - url = ("https://github.com/awslabs/cfncluster"), - license = "Apache License 2.0", - packages = find_packages(), - install_requires = requires, + name="aws-parallelcluster-node", + version=version, + author="Amazon Web Services", + description="aws-parallelcluster-node provides the scripts for an AWS ParallelCluster node.", + url="https://github.com/aws/aws-parallelcluster-node", + license="Apache License 2.0", + packages=find_packages(), + install_requires=requires, entry_points=dict(console_scripts=console_scripts), - include_package_data = True, - zip_safe = False, - package_data = { - '' : ['examples/config'], + include_package_data=True, + zip_safe=False, + package_data={ + '': ['examples/config'], }, - long_description = ("cfncluster-node is the python package installed on the Amazon EC2 instances launched as part of CfnCluster."), + long_description=("aws-parallelcluster-node is the python package installed on the Amazon EC2 instances launched " + "as part of AWS ParallelCluster."), classifiers=[ "Development Status :: 5 - Production/Stable", "Environment :: Console", diff --git a/sqswatcher/plugins/sge.py b/sqswatcher/plugins/sge.py index 3b94fd705..c016e83dc 100644 --- a/sqswatcher/plugins/sge.py +++ b/sqswatcher/plugins/sge.py @@ -89,7 +89,7 @@ def addHost(hostname, cluster_user, slots): ssh._host_keys_filename = None pass ssh.save_host_keys(hosts_key_file) - command = "sudo sh -c \'cd /opt/sge && /opt/sge/inst_sge -noremote -x -auto /opt/cfncluster/templates/sge/sge_inst.conf\'" + command = "sudo sh -c \'cd /opt/sge && /opt/sge/inst_sge -noremote -x -auto 
/opt/parallelcluster/templates/sge/sge_inst.conf\'" stdin, stdout, stderr = ssh.exec_command(command) while not stdout.channel.exit_status_ready(): time.sleep(1) diff --git a/sqswatcher/sqswatcher.py b/sqswatcher/sqswatcher.py index 32532c6ef..dee9f6911 100755 --- a/sqswatcher/sqswatcher.py +++ b/sqswatcher/sqswatcher.py @@ -134,7 +134,7 @@ def pollQueue(scheduler, q, t, proxy_config): log.info("eventType=%s" % eventType) if eventType == 'autoscaling:TEST_NOTIFICATION': message.delete() - elif eventType == 'cfncluster:COMPUTE_READY': + elif eventType == 'parallelcluster:COMPUTE_READY': instanceId = message_attrs.get('EC2InstanceId') slots = message_attrs.get('Slots') log.info("instanceId=%s" % instanceId) From 611e431e9ffaf17a2d35632e1fb0a8799e72fc07 Mon Sep 17 00:00:00 2001 From: Sean Smith <seaam@amazon.com> Date: Thu, 8 Nov 2018 15:12:31 -0800 Subject: [PATCH 2/8] Release Node Version 2.0.0rc1 Signed-off-by: Sean Smith <seaam@amazon.com> --- CHANGELOG.md | 6 ++++++ setup.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 245a9010e..176b0980f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ aws-parallelcluster-node CHANGELOG This file is used to list changes made in each version of the aws-parallelcluster-node package. 
+2.0.0 +----- + +- Rename package to AWS ParallelCluster + + 1.6.0 ----- diff --git a/setup.py b/setup.py index 036525feb..abac92695 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def read(fname): console_scripts = ['sqswatcher = sqswatcher.sqswatcher:main', 'nodewatcher = nodewatcher.nodewatcher:main', 'jobwatcher = jobwatcher.jobwatcher:main'] -version = "1.6.0" +version = "2.0.0rc1" requires = ['boto3>=1.7.55', 'python-dateutil>=2.6.1'] if sys.version_info[:2] == (2, 6): From 2e445642da9ba766be9c85b35f6c417dd0424f6f Mon Sep 17 00:00:00 2001 From: Sean Smith <seaam@amazon.com> Date: Mon, 12 Nov 2018 00:34:37 -0600 Subject: [PATCH 3/8] Release Version 2.0.0 Signed-off-by: Sean Smith <seaam@amazon.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index abac92695..587e35d22 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def read(fname): console_scripts = ['sqswatcher = sqswatcher.sqswatcher:main', 'nodewatcher = nodewatcher.nodewatcher:main', 'jobwatcher = jobwatcher.jobwatcher:main'] -version = "2.0.0rc1" +version = "2.0.0" requires = ['boto3>=1.7.55', 'python-dateutil>=2.6.1'] if sys.version_info[:2] == (2, 6): From db163e903a491d6ee62cc8f0299a7a578fe23cef Mon Sep 17 00:00:00 2001 From: Enrico Usai <usai@amazon.com> Date: Wed, 14 Nov 2018 10:24:23 +0100 Subject: [PATCH 4/8] Align readme to aws-parallelcluster main package Signed-off-by: Enrico Usai <usai@amazon.com> --- README.rst | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/README.rst b/README.rst index fed9daa42..26e17c8b4 100644 --- a/README.rst +++ b/README.rst @@ -1,9 +1,14 @@ -========================= -aws-parallelcluster-node -========================= +======================== +AWS ParallelCluster Node +======================== -.. 
image:: https://travis-ci.org/aws/aws-parallelcluster-node.png?branch=develop - :target: https://travis-ci.org/aws/aws-parallelcluster-node - :alt: Build Status +|Build Status| |Version| - aws-parallelcluster-node is the python package installed on the Amazon EC2 instances launched as part of AWS ParallelCluster. +.. |Build Status| image:: https://travis-ci.org/aws/aws-parallelcluster-node.png?branch=develop + :target: https://travis-ci.org/aws/aws-parallelcluster-node/ + :alt: Build Status +.. |Version| image:: https://badge.fury.io/py/aws-parallelcluster-node.png + :target: https://badge.fury.io/py/aws-parallelcluster-node + +This repo contains the aws-parallelcluster-node package installed on the Amazon EC2 instances launched +as part of AWS ParallelCluster. \ No newline at end of file From 059ed65c6c0bd1ef394ce0464bce4c72cc80121d Mon Sep 17 00:00:00 2001 From: Maurizio Melato <mmelato@amazon.com> Date: Fri, 16 Nov 2018 12:56:11 +0100 Subject: [PATCH 5/8] Bump version to 2.0.2 Signed-off-by: Maurizio Melato <mmelato@amazon.com> --- CHANGELOG.md | 5 +++++ setup.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 176b0980f..0ea42db20 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,11 @@ aws-parallelcluster-node CHANGELOG This file is used to list changes made in each version of the aws-parallelcluster-node package. 
+2.0.2 +----- + +- Align version to main ParallelCluster package + 2.0.0 ----- diff --git a/setup.py b/setup.py index 587e35d22..3bd7845b1 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def read(fname): console_scripts = ['sqswatcher = sqswatcher.sqswatcher:main', 'nodewatcher = nodewatcher.nodewatcher:main', 'jobwatcher = jobwatcher.jobwatcher:main'] -version = "2.0.0" +version = "2.0.2" requires = ['boto3>=1.7.55', 'python-dateutil>=2.6.1'] if sys.version_info[:2] == (2, 6): From 18c5d8037c7767a32e4d4b33e24f168e8c06cb7d Mon Sep 17 00:00:00 2001 From: Balaji Sridharan <fnubalaj@amazon.com> Date: Fri, 30 Nov 2018 16:15:09 -0800 Subject: [PATCH 6/8] Fix bug where jobs are allocated to nodes during termination and check stack for updates Signed-off-by: Balaji Sridharan <fnubalaj@amazon.com> --- nodewatcher/nodewatcher.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/nodewatcher/nodewatcher.py b/nodewatcher/nodewatcher.py index 5ec63610f..bb9764d0d 100755 --- a/nodewatcher/nodewatcher.py +++ b/nodewatcher/nodewatcher.py @@ -152,7 +152,8 @@ def stackCreationComplete(stack_name, region, proxy_config): log.info('Checking for status of the stack %s' % stack_name) cfn_client = boto3.client('cloudformation', region_name=region, config=proxy_config) stacks = cfn_client.describe_stacks(StackName=stack_name) - return stacks['Stacks'][0]['StackStatus'] == 'CREATE_COMPLETE' + return stacks['Stacks'][0]['StackStatus'] == 'CREATE_COMPLETE' or \ + stacks['Stacks'][0]['StackStatus'] == 'UPDATE_COMPLETE' def main(): @@ -183,7 +184,13 @@ def main(): data = {_CURRENT_IDLETIME: 0} stack_creation_complete = False + termination_in_progress = False while True: + # if this node is terminating sleep for a long time and wait for termination + if termination_in_progress: + time.sleep(300) + log.info('%s is still terminating' % hostname) + continue time.sleep(60) if not stack_creation_complete: stack_creation_complete = stackCreationComplete(stack_name, 
region, proxy_config) @@ -217,6 +224,7 @@ def main(): os.remove(_IDLETIME_FILE) try: selfTerminate(asg_name, asg_conn, instance_id) + termination_in_progress = True except ClientError as ex: log.error('Failed to terminate instance: %s with exception %s' % (instance_id, ex)) lockHost(s, hostname, unlock=True) From f5921be908ab2cdf095ffec94cf32a437c24397e Mon Sep 17 00:00:00 2001 From: Sean Smith <seaam@amazon.com> Date: Fri, 14 Dec 2018 09:51:53 -0800 Subject: [PATCH 7/8] Release 2.1.0 Signed-off-by: Sean Smith <seaam@amazon.com> --- CHANGELOG.md | 7 +++++++ setup.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ea42db20..693ceb374 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ aws-parallelcluster-node CHANGELOG This file is used to list changes made in each version of the aws-parallelcluster-node package. +2.1.0 +----- + +Bug Fixes: + + - Don't schedule jobs on compute nodes that are terminating + 2.0.2 ----- diff --git a/setup.py b/setup.py index 3bd7845b1..9432beb6e 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def read(fname): console_scripts = ['sqswatcher = sqswatcher.sqswatcher:main', 'nodewatcher = nodewatcher.nodewatcher:main', 'jobwatcher = jobwatcher.jobwatcher:main'] -version = "2.0.2" +version = "2.1.0" requires = ['boto3>=1.7.55', 'python-dateutil>=2.6.1'] if sys.version_info[:2] == (2, 6): From e7bd3f02201b9e967765a844a7df0b2857f1b9be Mon Sep 17 00:00:00 2001 From: Sean Smith <seaam@amazon.com> Date: Fri, 14 Dec 2018 10:00:05 -0800 Subject: [PATCH 8/8] Pin idna version Signed-off-by: Sean Smith <seaam@amazon.com> --- requirements26.txt | 3 ++- setup.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements26.txt b/requirements26.txt index 9a5a4e2b1..cfc503511 100644 --- a/requirements26.txt +++ b/requirements26.txt @@ -1,2 +1,3 @@ pycparser==2.18 -paramiko==2.3.3 +idna==2.6 +paramiko==2.3.3 \ No newline at end of file diff --git a/setup.py 
b/setup.py index 9432beb6e..6e7164c89 100644 --- a/setup.py +++ b/setup.py @@ -32,6 +32,7 @@ def read(fname): # For python2.6 we have to require argparse since it # was not in stdlib until 2.7. requires.append('argparse>=1.4') + requires.append('idna==2.6') requires.append('paramiko==2.3.3') requires.append('pycparser==2.18') else: