Skip to content

Commit

Permalink
Workaround for Spark ORC column names
Browse files Browse the repository at this point in the history
A bug in the Catalyst parser manifests with purely numeric column names.
Prefixing each generated column name with "c" works around the issue.
  • Loading branch information
ecurtin authored and Emily Curtin committed Feb 16, 2018
1 parent a1975e5 commit 40f60dc
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ case class KMeansDataGen(
}

val (convertTime, dataDF) = time {
val schemaString = data.first().indices.map(_.toString).mkString(" ")
val schemaString = data.first().indices.map(i => "c" + i.toString).mkString(" ")
val fields = schemaString.split(" ").map(fieldName => StructField(fieldName, DoubleType, nullable = false))
val schema = StructType(fields)
val rowRDD = data.map(arr => Row(arr:_*))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,40 +21,41 @@ import java.io.File

import com.ibm.sparktc.sparkbench.datageneration.mlgenerator.KMeansDataGen
import com.ibm.sparktc.sparkbench.testfixtures.{BuildAndTeardownData, SparkSessionProvider}
import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers}
import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers}

import scala.io.Source

class KMeansDataGenTest extends FlatSpec with Matchers with BeforeAndAfterEach {
class KMeansDataGenTest extends FlatSpec with Matchers with BeforeAndAfterAll {
val cool = new BuildAndTeardownData("kmeans-data-gen")

val fileName = s"${cool.sparkBenchTestFolder}/${java.util.UUID.randomUUID.toString}.csv"
val fileName = s"${cool.sparkBenchTestFolder}/${java.util.UUID.randomUUID.toString}"

var file: File = _

override def beforeEach() {
override def beforeAll() {
cool.createFolders()
file = new File(fileName)
}

override def afterEach() {
override def afterAll() {
cool.deleteFolders()
}

"KMeansDataGeneration" should "generate data correctly" in {
"KMeansDataGeneration" should "generate a csv correctly" in {

val csvFile = s"$fileName.csv"

val m = Map(
"name" -> "kmeans",
"rows" -> 10,
"cols" -> 10,
"output" -> fileName
"output" -> csvFile
)

val generator = KMeansDataGen(m)


generator.doWorkload(spark = SparkSessionProvider.spark)

file = new File(csvFile)

val fileList = file.listFiles().toList.filter(_.getName.startsWith("part"))

Expand All @@ -74,4 +75,34 @@ class KMeansDataGenTest extends FlatSpec with Matchers with BeforeAndAfterEach {
*/
length shouldBe generator.numRows + fileList.length
}

// Verifies that KMeansDataGen can write its output in ORC format and that the
// written data round-trips through spark.read.orc with the expected row count.
// NOTE(review): this exercises the "c"-prefixed column-name workaround — purely
// numeric column names trigger a Catalyst parser bug (see commit message).
it should "generate an ORC file correctly" in {
val spark = SparkSessionProvider.spark

// Give the ORC output its own path suffix so it cannot collide with the
// csv test's output under the same randomized fileName base.
val orcFile = s"$fileName.orc"

// Workload configuration: 10 rows x 10 cols of k-means data written to orcFile.
val m = Map(
"name" -> "kmeans",
"rows" -> 10,
"cols" -> 10,
"output" -> orcFile
)

val generator = KMeansDataGen(m)

generator.doWorkload(spark = spark)

// Spark writes a directory at orcFile containing part-files, not a single file.
file = new File(orcFile)

// Assumes at least one "part" file proves the write succeeded — TODO confirm
// Spark always uses the "part" prefix for this writer version.
val list = file.listFiles().toList
val fileList = list.filter(_.getName.startsWith("part"))

fileList.length should be > 0

println(s"reading file $orcFile")

// Read the directory back and confirm the row count matches the "rows" setting.
val fromDisk = spark.read.orc(orcFile)
val rows = fromDisk.count()
rows shouldBe 10
}
}

0 comments on commit 40f60dc

Please sign in to comment.