Setting spark.app.id to a more intuitive name #124

Merged 3 commits on Aug 17, 2023
17 changes: 13 additions & 4 deletions service_configuration_lib/spark_config.py
@@ -6,6 +6,7 @@
 import logging
 import math
 import os
+import re
 import time
 from typing import Any
 from typing import Dict
@@ -37,6 +38,7 @@

 NON_CONFIGURABLE_SPARK_OPTS = {
     'spark.master',
+    'spark.app.id',
     'spark.ui.port',
     'spark.mesos.principal',
     'spark.mesos.secret',
@@ -1076,20 +1078,27 @@ def get_spark_conf(
     _pick_random_port(PREFERRED_SPARK_UI_PORT),
 )

-spark_conf = {**(spark_opts_from_env or {}), **_filter_user_spark_opts(user_spark_opts)}
-
-if aws_creds[2] is not None:
-    spark_conf['spark.hadoop.fs.s3a.aws.credentials.provider'] = AWS_ENV_CREDENTIALS_PROVIDER

Contributor Author commented: This is just for moving the following lines down a bit for better code readability:

# app_name from env already has the port and time appended to make it unique
app_name = (spark_opts_from_env or {}).get('spark.app.name')
if not app_name:
    # We want to make the app name more unique so that we can search it
    # from history server.
    app_name = f'{app_base_name}_{ui_port}_{int(time.time())}'

 # app_name from env already has the port and time appended to make it unique
 app_name = (spark_opts_from_env or {}).get('spark.app.name')
 if not app_name:
     # We want to make the app name more unique so that we can search it
     # from history server.
     app_name = f'{app_base_name}_{ui_port}_{int(time.time())}'

+spark_conf = {**(spark_opts_from_env or {}), **_filter_user_spark_opts(user_spark_opts)}
+
+if aws_creds[2] is not None:
+    spark_conf['spark.hadoop.fs.s3a.aws.credentials.provider'] = AWS_ENV_CREDENTIALS_PROVIDER
+# Explicitly setting app id: replace special characters with '_' to make it consistent
+# in all places for metric systems:
+# - since in the Prometheus metrics endpoint those will be converted to '_'
+# - while the 'spark-app-selector' executor pod label will keep the original app id
+app_id = re.sub(r'[\.,-]', '_', app_name)
Contributor commented: Sorry I missed it earlier. There is a character limit of 63 for the app id and, I think, 253 for the app name; the app id needs to be trimmed to make sure it stays within the limit. Secondly, it would likely be useful to have the app name as service__instance_timestamp, or service__job__id__action_name_timestamp for adhoc/tron jobs.
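
A minimal sketch of the trim being suggested (hypothetical helper, not part of this diff; the 63-character cap is the Kubernetes label value limit the reviewer refers to):

import re

# Assumed limit from the review comment: 63 characters for the app id.
MAX_APP_ID_LENGTH = 63

def sanitize_app_id(app_name: str) -> str:
    # Same substitution as the diff above, then trim to the assumed limit.
    app_id = re.sub(r'[\.,-]', '_', app_name)
    return app_id[:MAX_APP_ID_LENGTH]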


 spark_conf.update({
     'spark.app.name': app_name,
+    'spark.app.id': app_id,
     'spark.ui.port': str(ui_port),
 })

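For illustration, what the substitution in this diff produces for a made-up app name (all example values are hypothetical):

import re

app_name = 'my-service_main_33000_1692230400'  # hypothetical {app_base_name}_{ui_port}_{timestamp}
app_id = re.sub(r'[\.,-]', '_', app_name)
print(app_id)  # -> my_service_main_33000_1692230400
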
2 changes: 1 addition & 1 deletion setup.py
@@ -17,7 +17,7 @@

 setup(
     name='service-configuration-lib',
-    version='2.18.3',
+    version='2.18.4',
     provides=['service_configuration_lib'],
     description='Start, stop, and inspect Yelp SOA services',
     url='https://github.com/Yelp/service_configuration_lib',
14 changes: 14 additions & 0 deletions tests/spark_config_test.py
@@ -2,6 +2,7 @@
 import itertools
 import json
 import os
+import re
 import sys
 from unittest import mock

@@ -1138,6 +1139,14 @@ def verify(output):
             return [key]
         return verify

+    @pytest.fixture
+    def assert_app_id(self):
+        def verify(output):
+            key = 'spark.app.id'
+            assert output[key] == re.sub(r'[\.,-]', '_', output['spark.app.name'])
+            return [key]
+        return verify

     @pytest.fixture
     def assert_mesos_conf(self):
         def verify(output):
@@ -1230,6 +1239,7 @@ def test_leaders_get_spark_conf_kubernetes(
         mock_time,
         assert_ui_port,
         assert_app_name,
+        assert_app_id,
         assert_kubernetes_conf,
         mock_log,
     ):
@@ -1262,6 +1272,7 @@ def test_leaders_get_spark_conf_kubernetes(
         verified_keys = set(
             assert_ui_port(output) +
             assert_app_name(output) +
+            assert_app_id(output) +
             assert_kubernetes_conf(output) +
             list(other_spark_opts.keys()) +
             list(mock_adjust_spark_requested_resources_kubernetes.return_value.keys()) +
@@ -1321,6 +1332,7 @@ def test_show_console_progress_jupyter(
         mock_time,
         assert_ui_port,
         assert_app_name,
+        assert_app_id,
         assert_local_conf,
         mock_log,
     ):
@@ -1361,6 +1373,7 @@ def test_local_spark(
         mock_time,
         assert_ui_port,
         assert_app_name,
+        assert_app_id,
         assert_local_conf,
         mock_log,
     ):
@@ -1385,6 +1398,7 @@ def test_local_spark(
         verified_keys = set(
             assert_ui_port(output) +
             assert_app_name(output) +
+            assert_app_id(output) +
             assert_local_conf(output) +
             list(mock_append_spark_prometheus_conf.return_value.keys()) +
             list(mock_append_event_log_conf.return_value.keys()) +