Fixes spark-home param, updates docs accordingly
This commit fixes a bug in which spark-home was not being picked up from
the configuration file, forcing users to set the environment variable
SPARK_HOME.

The docs are also updated to include a guide to the parameters in
spark-submit-config, including spark-home.
ecurtin authored and Emily Curtin committed Oct 4, 2017
1 parent dc59441 commit 3b0cfac
Showing 10 changed files with 195 additions and 12 deletions.
3 changes: 3 additions & 0 deletions docs/_config.yml
@@ -39,6 +39,9 @@ collections:
workloads:
output: true
permalink: /workloads/:path/
users-guide:
output: true
permalink: /users-guide/:path/


# Exclude from processing.
106 changes: 106 additions & 0 deletions docs/_users-guide/spark-submit-config.md
@@ -0,0 +1,106 @@
---
layout: page
title: Spark-Submit Configuration
---

Under the hood, Spark-Bench converts users' configuration files into a series of spark-submit scripts.
The spark-submit-config section of the configuration file allows users to change the parameters of
those spark-submits. The `class` and `jar` parameters are set by the spark-bench infrastructure.

<!-- START doctoc generated TOC please keep comment here to allow auto update -->
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->

- [Parameters](#parameters)
- [spark-home](#spark-home)
- [spark-args](#spark-args)
- [conf](#conf)
- [suites-parallel](#suites-parallel)

<!-- END doctoc generated TOC please keep comment here to allow auto update -->

## Parameters

| Name | Required | Description |
| ---------- | ----- | --- |
| spark-home | no | Path to the top level of your Spark installation |
| spark-args | no | Includes master, executor-memory, and other spark-submit arguments |
| conf | no | A series of configuration options for Spark |
| suites-parallel | no | Whether the workload-suites within this spark-submit should run serially or in parallel. Defaults to `false`. |
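
Taken together, a spark-submit-config block combining these parameters might look like the sketch below. The paths and values are purely illustrative, not defaults:

```hocon
spark-bench = {
  spark-submit-config = [{
    spark-home = "/opt/spark" // hypothetical installation path
    spark-args = {
      master = "local[2]"
    }
    conf = {
      "spark.shuffle.service.enabled" = "false"
    }
    suites-parallel = false
    workload-suites = [
      // workload suites go here
    ]
  }]
}
```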

## spark-home

`spark-home` points to the top level directory of your Spark installation.

Spark home must be set either through the `spark-home` parameter in the configuration file
or through the environment variable `SPARK_HOME`. If both are set, the configuration file
wins. This allows users to benchmark more than one installation of Spark within one configuration file.
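
For example, to benchmark two different Spark installations from the same configuration file, each entry in the spark-submit-config list can point at its own spark-home. A sketch, with hypothetical paths:

```hocon
spark-submit-config = [
  {
    spark-home = "/opt/spark-2.1.0" // hypothetical path to the first installation
    // spark-args, conf, workload-suites, ...
  },
  {
    spark-home = "/opt/spark-2.2.0" // hypothetical path to the second installation
    // spark-args, conf, workload-suites, ...
  }
]
```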

## spark-args

`spark-args` contains a series of key-value pairs that reflect arguments users would normally set in their spark-submit scripts.

To read more about these options in the context of Spark, see the official documentation: <https://spark.apache.org/docs/latest/submitting-applications.html>

Probably the most important of these is `master`. Just as in a spark-submit script, users can set `master` to local, to an IP address and port for standalone mode, or to `yarn` or `mesos`.

```hocon
spark-args = {
  master = "local[4]" // or local[*], local[2], etc.
}
```
```hocon
spark-args = {
  master = "spark://207.184.161.138:7077" // standalone
}
```
```hocon
spark-args = {
  master = "yarn"
  deploy-mode = "cluster"
}
```
```hocon
spark-args = {
  master = "mesos://207.184.161.138:7077"
}
```

`master` is the only spark-arg that can also be set in an environment variable. If `SPARK_MASTER_HOST` and `spark-args = { master = ...`
are both set, the configuration file option will win.

Other spark-args include `deploy-mode`, `executor-memory`, `num-executors`, and `total-executor-cores`, among others.
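
As a sketch, several of these arguments can be combined in a single spark-args block; the values shown are examples only:

```hocon
spark-args = {
  master = "yarn"
  deploy-mode = "cluster"
  executor-memory = "4G" // example value
  num-executors = 10     // example value
}
```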

## conf

The `conf` parameter contains a series of key-value pairs of strings representing configuration options for Spark.
These are the options that would normally be passed to spark-submit with the `--conf` flag.

Example:
```hocon
conf = {
  "spark.dynamicAllocation.enabled" = "false"
  "spark.shuffle.service.enabled" = "false"
}
```

Notice that `conf` is itself a set of options. To benchmark against several different sets of conf options, users provide a list of objects like so:
```hocon
conf = [
  {
    "spark.dynamicAllocation.enabled" = "false"
    "spark.shuffle.service.enabled" = "false"
  },
  {
    "spark.dynamicAllocation.enabled" = "true"
    "spark.shuffle.service.enabled" = "true"
  }
]
```
This will create two spark-submit scripts that will have the same workload-suites and other parameters,
but the first will have `"spark.dynamicAllocation.enabled" = "false"` and `"spark.shuffle.service.enabled" = "false"`,
and the second will have `"spark.dynamicAllocation.enabled" = "true"` and `"spark.shuffle.service.enabled" = "true"`.

## suites-parallel

`suites-parallel` controls whether the workload suites within this spark-submit run serially or in parallel.
This option defaults to `false`, meaning the suites will run serially.
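
For example, to run the workload suites within this spark-submit in parallel, assuming the rest of the block is filled in as shown above:

```hocon
spark-submit-config = [{
  suites-parallel = true
  // spark-args, workload-suites, etc.
}]
```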
15 changes: 15 additions & 0 deletions docs/user-guide.md
@@ -0,0 +1,15 @@
---
layout: page
title: User's Guide
permalink: /users-guide/
---

<ul>
  {% for page in site.users-guide %}
    <li>
      <h3>
        <a class="page-link" href="{{ page.url | relative_url }}">{{ page.title | escape }}</a>
      </h3>
    </li>
  {% endfor %}
</ul>
6 changes: 1 addition & 5 deletions scalastyle-config.xml
@@ -91,11 +91,7 @@
</check>
<check level="warning" class="org.scalastyle.scalariform.UppercaseLChecker" enabled="true"></check>
<check level="warning" class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" enabled="true"></check>
<check level="warning" class="org.scalastyle.scalariform.IfBraceChecker" enabled="true">
<parameters>
<parameter name="singleLineAllowed"><![CDATA[true]]></parameter>
<parameter name="doubleLineAllowed"><![CDATA[false]]></parameter>
</parameters>
<check level="warning" class="org.scalastyle.scalariform.IfBraceChecker" enabled="false">
</check>
<check level="warning" class="org.scalastyle.scalariform.MethodLengthChecker" enabled="true">
<parameters>
@@ -90,7 +90,7 @@ object SparkLaunchConf {
fileList.filterNot(_ == whereIAm).head
}
else if(whereIAm.isEmpty) {
throw new SparkBenchException("Could not determine location for necessary spark-bench jars."); null
throw SparkBenchException("Could not determine location for necessary spark-bench jars."); null
}
else {
/* Assume here that we're in a testing environment. When `sbt test` runs for spark-launch, it'll
@@ -25,6 +25,7 @@ object SparkLaunchDefaults {
val sparkConf = "conf"
val suites = "workload-suites"
val suitesParallel = "suites-parallel"
val sparkHome = "spark-home"

val suitesParallelDefaultValue = false
}
@@ -88,25 +88,25 @@ case class SparkSubmitDeconstructed (
else None
}

val suitesParallel = optionallyGetFromJavaMap[Boolean](sparkSubmitOptions,SLD.suitesParallel)

val sparkHome = optionallyGetFromJavaMap[String](sparkSubmitOptions, SLD.sparkHome)
val suitesParallel = optionallyGetFromJavaMap[Boolean](sparkSubmitOptions, SLD.suitesParallel)
val conf: Option[util.Map[String, Any]] = optionallyGetFromJavaMap[util.Map[String, Any]](sparkSubmitOptions, SLD.sparkConf)

val sparkArgs: Option[util.Map[String, Any]] = {

val asScala = sparkSubmitOptions.asScala


val filtered = asScala.filterKeys {
case SLD.sparkConf => false
case SLD.suitesParallel => false
case SLD.sparkHome => false
case _ => true
}

if (filtered.isEmpty) None
else Some(filtered.asJava)
}

SparkSubmitPieces(
sparkHome,
suitesParallel,
conf,
sparkArgs,
@@ -116,6 +116,7 @@
}

case class SparkSubmitPieces (
sparkHome: Option[String],
suitesParallel: Option[Boolean],
conf: Option[util.Map[String, Any]],
sparkArgs: Option[util.Map[String, Any]],
@@ -129,6 +130,7 @@
Try(key -> option.get).toOption

val mostOfIt = Seq(
ifItsThere(SLD.sparkHome, sparkHome),
ifItsThere(SLD.suitesParallel, suitesParallel),
ifItsThere(SLD.sparkConf, conf),
ifItsThere(SLD.sparkArgs, sparkArgs)
45 changes: 45 additions & 0 deletions spark-launch/src/test/resources/etc/specific-spark-home.conf
@@ -0,0 +1,45 @@
spark-bench = {
  spark-submit-config = [{
    spark-home = "/usr/iop/current/spark2-client/"
    spark-args = {
      master = "yarn"
    }
    suites-parallel = false
    workload-suites = [
      {
        descr = "Generate a dataset, then take that same dataset and write it out to Parquet format"
        benchmark-output = "/home/dev-user/emily/results-data-gen.csv"
        // We need to generate the dataset first through the data generator, then we take that dataset and convert it to Parquet.
        parallel = false
        workloads = [
          {
            name = "data-generation-kmeans"
            rows = 100000000
            cols = 24
            output = "hdfs:///tmp/kmeans-data.csv"
          },
          {
            name = "sql"
            queryStr = "select * from input"
            input = "hdfs:///tmp/kmeans-data.csv"
            output = "hdfs:///tmp/kmeans-data.parquet"
          }
        ]
      },
      {
        descr = "Run two different SQL queries over the dataset in two different formats"
        benchmark-output = "/home/dev-user/emily/results-sql.csv"
        parallel = false
        repeat = 10
        workloads = [
          {
            name = "sql"
            input = ["hdfs:///tmp/kmeans-data.csv", "hdfs:///tmp/kmeans-data.parquet"]
            queryStr = ["select * from input", "select `0`, `22` from input where `0` < -0.9"]
            cache = false
          }
        ]
      }
    ]
  }]
}
@@ -60,7 +60,6 @@ class ConfigWranglerTest extends FlatSpec with Matchers with BeforeAndAfterEach
val res1 = seq1.flatMap(ConfigWrangler.processConfig)
val res2 = seq2.flatMap(ConfigWrangler.processConfig)

//TODO ugh, this exposed a bug. Well, I guess that's what tests are supposed to do...
res1.size shouldBe 5 // 4 resulting from crossjoin plus 1 more from other spark-submit-config
res2.size shouldBe 1
}
@@ -35,7 +35,9 @@ class SparkLaunchConfTest extends FlatSpec with Matchers with BeforeAndAfter {
val field = System.getenv().getClass.getDeclaredField("m")
field.setAccessible(true)
val map = field.get(System.getenv()).asInstanceOf[java.util.Map[java.lang.String, java.lang.String]]
val value = map.get(key)
map.remove(key)
value
}

before {
@@ -82,4 +84,18 @@
SparkLaunch.rmTmpFiles(sparkContextConfs.map(_._2))
}

it should "pick up spark-home as set in the config file" in {
val oldSparkHome = unsetEnv("SPARK_HOME")
val relativePath = "/etc/specific-spark-home.conf"
val resource = new File(getClass.getResource(relativePath).toURI)
val (sparkContextConfs, _) = SparkLaunch.mkConfs(resource)
val conf2 = sparkContextConfs.head._1

conf2.sparkHome shouldBe "/usr/iop/current/spark2-client/"

setEnv("SPARK_HOME", oldSparkHome)
SparkLaunch.rmTmpFiles(sparkContextConfs.map(_._2))

}

}
