diff --git a/docs/_config.yml b/docs/_config.yml
index 11e2f34c..fa8eb2d8 100644
--- a/docs/_config.yml
+++ b/docs/_config.yml
@@ -39,6 +39,9 @@
 collections:
   workloads:
     output: true
     permalink: /workloads/:path/
+  users-guide:
+    output: true
+    permalink: /users-guide/:path/
 
 # Exclude from processing.
diff --git a/docs/_users-guide/spark-submit-config.md b/docs/_users-guide/spark-submit-config.md
new file mode 100644
index 00000000..3ef1e7e5
--- /dev/null
+++ b/docs/_users-guide/spark-submit-config.md
@@ -0,0 +1,106 @@
+---
+layout: page
+title: Spark-Submit Configuration
+---
+
+Under the hood, Spark-Bench converts users' configuration files into a series of spark-submit scripts.
+The spark-submit-config section of the configuration file allows users to change the parameters of
+those spark-submits. The `class` and `jar` parameters are set by the spark-bench infrastructure.
+
+- [Parameters](#parameters)
+- [spark-home](#spark-home)
+- [spark-args](#spark-args)
+- [conf](#conf)
+- [suites-parallel](#suites-parallel)
+
+## Parameters
+
+| Name            | Required | Description |
+| --------------- | -------- | ----------- |
+| spark-home      | no       | Path to the top level of your Spark installation |
+| spark-args      | no       | Includes master, executor-memory, and other spark-submit arguments |
+| conf            | no       | A series of configuration options for Spark |
+| suites-parallel | no       | Whether the workload-suites within this spark-submit should run serially or in parallel. Defaults to `false`. |
+
+## spark-home
+
+`spark-home` points to the top-level directory of your Spark installation.
+
+This parameter can be set either through the `spark-home` parameter in the configuration file
+or through the `SPARK_HOME` environment variable. If both are set, the configuration file
+wins. This allows users to benchmark more than one installation of Spark within one configuration file.
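+
+For example, a spark-submit-config entry can point at a particular installation like this
+(the path shown is purely illustrative, not a default):
+
+```hocon
+spark-submit-config = [{
+  spark-home = "/opt/spark"  // illustrative path; substitute your own installation
+  // ... spark-args, workload-suites, etc. as described below
+}]
+```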
+
+## spark-args
+
+`spark-args` contains a series of key-value pairs that reflect arguments users would normally set in their spark-submit scripts.
+To read more about these options in the context of Spark, see the official documentation on
+[submitting applications](https://spark.apache.org/docs/latest/submitting-applications.html).
+
+Probably the most important of these is `master`. Just like in a spark-submit script, users can set master to local, an IP address and port for standalone, or yarn or mesos.
+
+```hocon
+spark-args = {
+  master = "local[4]" // or local[*], local[2], etc.
+}
+```
+```hocon
+spark-args = {
+  master = "spark://207.184.161.138:7077" // standalone
+}
+```
+```hocon
+spark-args = {
+  master = "yarn"
+  deploy-mode = "cluster"
+}
+```
+```hocon
+spark-args = {
+  master = "mesos://207.184.161.138:7077"
+}
+```
+
+`master` is the only spark-arg that can also be set in an environment variable. If `SPARK_MASTER_HOST` and `spark-args = { master = ...`
+are both set, the configuration file option will win.
+
+Other spark args include, but are not limited to, `deploy-mode`, `executor-memory`, `num-executors`, `total-executor-cores`, etc.
+
+## conf
+
+The `conf` parameter contains a series of pairs of strings representing configuration options for Spark.
+These are the options that would be prefaced by the `--conf` flag in a spark-submit script.
+
+Example:
+```hocon
+conf = {
+  "spark.dynamicAllocation.enabled" = "false"
+  "spark.shuffle.service.enabled" = "false"
+}
+```
+
+Notice that `conf` is a SERIES of options. To make a list of conf options, users must make a list of objects like so:
+```hocon
+conf = [
+  {
+    "spark.dynamicAllocation.enabled" = "false"
+    "spark.shuffle.service.enabled" = "false"
+  },
+  {
+    "spark.dynamicAllocation.enabled" = "true"
+    "spark.shuffle.service.enabled" = "true"
+  }
+]
+```
+This will create two spark-submit scripts that have the same workload-suites and other parameters,
+but the first will have `"spark.dynamicAllocation.enabled" = "false"` and `"spark.shuffle.service.enabled" = "false"`,
+and the second will have `"spark.dynamicAllocation.enabled" = "true"` and `"spark.shuffle.service.enabled" = "true"`.
+
+## suites-parallel
+
+`suites-parallel` controls whether the workload suites within this spark-submit run serially or in parallel.
+This option defaults to `false`, meaning the suites will run serially.
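+
+For example, to run all of the workload-suites within this spark-submit at the same time (illustrative snippet):
+```hocon
+suites-parallel = true
+```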
diff --git a/docs/user-guide.md b/docs/user-guide.md
new file mode 100644
index 00000000..5772d553
--- /dev/null
+++ b/docs/user-guide.md
@@ -0,0 +1,15 @@
+---
+layout: page
+title: User's Guide
+permalink: /users-guide/
+---
+
+
\ No newline at end of file
diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index f7f764dc..014e86d1 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -91,11 +91,7 @@
-
-
-
-
-
+
diff --git a/spark-launch/src/main/scala/com/ibm/sparktc/sparkbench/sparklaunch/SparkLaunchConf.scala b/spark-launch/src/main/scala/com/ibm/sparktc/sparkbench/sparklaunch/SparkLaunchConf.scala
index b71fadf7..31f074ab 100644
--- a/spark-launch/src/main/scala/com/ibm/sparktc/sparkbench/sparklaunch/SparkLaunchConf.scala
+++ b/spark-launch/src/main/scala/com/ibm/sparktc/sparkbench/sparklaunch/SparkLaunchConf.scala
@@ -90,7 +90,7 @@
       fileList.filterNot(_ == whereIAm).head
     }
     else if(whereIAm.isEmpty) {
-      throw new SparkBenchException("Could not determine location for necessary spark-bench jars."); null
+      throw SparkBenchException("Could not determine location for necessary spark-bench jars."); null
     }
     else {
      /* Assume here that we're in a testing environment. When `sbt test` runs for spark-launch, it'll
diff --git a/spark-launch/src/main/scala/com/ibm/sparktc/sparkbench/sparklaunch/SparkLaunchDefaults.scala b/spark-launch/src/main/scala/com/ibm/sparktc/sparkbench/sparklaunch/SparkLaunchDefaults.scala
index 161c33c6..cfb2cab4 100644
--- a/spark-launch/src/main/scala/com/ibm/sparktc/sparkbench/sparklaunch/SparkLaunchDefaults.scala
+++ b/spark-launch/src/main/scala/com/ibm/sparktc/sparkbench/sparklaunch/SparkLaunchDefaults.scala
@@ -25,6 +25,7 @@ object SparkLaunchDefaults {
   val sparkConf = "conf"
   val suites = "workload-suites"
   val suitesParallel = "suites-parallel"
+  val sparkHome = "spark-home"
 
   val suitesParallelDefaultValue = false
 }
diff --git a/spark-launch/src/main/scala/com/ibm/sparktc/sparkbench/sparklaunch/SparkSubmitDeconstructed.scala b/spark-launch/src/main/scala/com/ibm/sparktc/sparkbench/sparklaunch/SparkSubmitDeconstructed.scala
index 4486df62..27817b57 100644
--- a/spark-launch/src/main/scala/com/ibm/sparktc/sparkbench/sparklaunch/SparkSubmitDeconstructed.scala
+++ b/spark-launch/src/main/scala/com/ibm/sparktc/sparkbench/sparklaunch/SparkSubmitDeconstructed.scala
@@ -88,25 +88,25 @@
     else None
   }
 
-  val suitesParallel = optionallyGetFromJavaMap[Boolean](sparkSubmitOptions,SLD.suitesParallel)
-
+  val sparkHome = optionallyGetFromJavaMap[String](sparkSubmitOptions, SLD.sparkHome)
+  val suitesParallel = optionallyGetFromJavaMap[Boolean](sparkSubmitOptions, SLD.suitesParallel)
   val conf: Option[util.Map[String, Any]] = optionallyGetFromJavaMap[util.Map[String, Any]](sparkSubmitOptions, SLD.sparkConf)
 
   val sparkArgs: Option[util.Map[String, Any]] = {
-
     val asScala = sparkSubmitOptions.asScala
-
-
     val filtered = asScala.filterKeys {
       case SLD.sparkConf => false
       case SLD.suitesParallel => false
+      case SLD.sparkHome => false
       case _ => true
     }
+
     if (filtered.isEmpty) None else Some(filtered.asJava)
   }
 
   SparkSubmitPieces(
+    sparkHome,
     suitesParallel,
     conf,
     sparkArgs,
@@ -116,6 +116,7 @@
 }
 
 case class SparkSubmitPieces (
+  sparkHome: Option[String],
   suitesParallel: Option[Boolean],
   conf: Option[util.Map[String, Any]],
   sparkArgs: Option[util.Map[String, Any]],
@@ -129,6 +130,7 @@
     Try(key -> option.get).toOption
 
   val mostOfIt = Seq(
+    ifItsThere(SLD.sparkHome, sparkHome),
     ifItsThere(SLD.suitesParallel, suitesParallel),
     ifItsThere(SLD.sparkConf, conf),
     ifItsThere(SLD.sparkArgs, sparkArgs)
diff --git a/spark-launch/src/test/resources/etc/specific-spark-home.conf b/spark-launch/src/test/resources/etc/specific-spark-home.conf
new file mode 100644
index 00000000..ea498533
--- /dev/null
+++ b/spark-launch/src/test/resources/etc/specific-spark-home.conf
@@ -0,0 +1,45 @@
+spark-bench = {
+  spark-submit-config = [{
+    spark-home = "/usr/iop/current/spark2-client/"
+    spark-args = {
+      master = "yarn"
+    }
+    suites-parallel = false
+    workload-suites = [
+      {
+        descr = "Generate a dataset, then take that same dataset and write it out to Parquet format"
+        benchmark-output = "/home/dev-user/emily/results-data-gen.csv"
+        // We need to generate the dataset first through the data generator, then we take that dataset and convert it to Parquet.
+        parallel = false
+        workloads = [
+          {
+            name = "data-generation-kmeans"
+            rows = 100000000
+            cols = 24
+            output = "hdfs:///tmp/kmeans-data.csv"
+          },
+          {
+            name = "sql"
+            queryStr = "select * from input"
+            input = "hdfs:///tmp/kmeans-data.csv"
+            output = "hdfs:///tmp/kmeans-data.parquet"
+          }
+        ]
+      },
+      {
+        descr = "Run two different SQL queries over the dataset in two different formats"
+        benchmark-output = "/home/dev-user/emily/results-sql.csv"
+        parallel = false
+        repeat = 10
+        workloads = [
+          {
+            name = "sql"
+            input = ["hdfs:///tmp/kmeans-data.csv", "hdfs:///tmp/kmeans-data.parquet"]
+            queryStr = ["select * from input", "select `0`, `22` from input where `0` < -0.9"]
+            cache = false
+          }
+        ]
+      }
+    ]
+  }]
+}
diff --git a/spark-launch/src/test/scala/com/ibm/sparktc/sparkbench/sparklaunch/ConfigWranglerTest.scala b/spark-launch/src/test/scala/com/ibm/sparktc/sparkbench/sparklaunch/ConfigWranglerTest.scala
index 354d2618..9fa59176 100644
--- a/spark-launch/src/test/scala/com/ibm/sparktc/sparkbench/sparklaunch/ConfigWranglerTest.scala
+++ b/spark-launch/src/test/scala/com/ibm/sparktc/sparkbench/sparklaunch/ConfigWranglerTest.scala
@@ -60,7 +60,6 @@ class ConfigWranglerTest extends FlatSpec with Matchers with BeforeAndAfterEach
     val res1 = seq1.flatMap(ConfigWrangler.processConfig)
     val res2 = seq2.flatMap(ConfigWrangler.processConfig)
 
-    //TODO ugh, this exposed a bug. Well, I guess that's what tests are supposed to do...
     res1.size shouldBe 5 // 4 resulting from crossjoin plus 1 more from other spark-submit-config
     res2.size shouldBe 1
   }
diff --git a/spark-launch/src/test/scala/com/ibm/sparktc/sparkbench/sparklaunch/SparkLaunchConfTest.scala b/spark-launch/src/test/scala/com/ibm/sparktc/sparkbench/sparklaunch/SparkLaunchConfTest.scala
index fa576048..16349637 100644
--- a/spark-launch/src/test/scala/com/ibm/sparktc/sparkbench/sparklaunch/SparkLaunchConfTest.scala
+++ b/spark-launch/src/test/scala/com/ibm/sparktc/sparkbench/sparklaunch/SparkLaunchConfTest.scala
@@ -35,7 +35,9 @@ class SparkLaunchConfTest extends FlatSpec with Matchers with BeforeAndAfter {
     val field = System.getenv().getClass.getDeclaredField("m")
     field.setAccessible(true)
     val map = field.get(System.getenv()).asInstanceOf[java.util.Map[java.lang.String, java.lang.String]]
+    val value = map.get(key)
     map.remove(key)
+    value
   }
 
   before {
@@ -82,4 +84,18 @@
     SparkLaunch.rmTmpFiles(sparkContextConfs.map(_._2))
   }
 
+  it should "pick up spark-home as set in the config file" in {
+    val oldSparkHome = unsetEnv("SPARK_HOME")
+    val relativePath = "/etc/specific-spark-home.conf"
+    val resource = new File(getClass.getResource(relativePath).toURI)
+    val (sparkContextConfs, _) = SparkLaunch.mkConfs(resource)
+    val conf2 = sparkContextConfs.head._1
+
+    conf2.sparkHome shouldBe "/usr/iop/current/spark2-client/"
+
+    setEnv("SPARK_HOME", oldSparkHome)
+    SparkLaunch.rmTmpFiles(sparkContextConfs.map(_._2))
+
+  }
+
 }