From 609a0fbea3c7b9d9dafcb9523eee7fb9f647d83e Mon Sep 17 00:00:00 2001 From: Michael Brachmann Date: Wed, 2 Oct 2019 12:21:03 -0400 Subject: [PATCH 1/2] minor updates to enable building for scala 2.12 --- build.sbt | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/build.sbt b/build.sbt index c01d73a..5d9e666 100644 --- a/build.sbt +++ b/build.sbt @@ -2,11 +2,11 @@ name := "spark-google-spreadsheets" organization := "com.github.potix2" -scalaVersion := "2.11.12" +scalaVersion := "2.12.10" -crossScalaVersions := Seq("2.11.12") +crossScalaVersions := Seq("2.12.10") -version := "0.6.4-SNAPSHOT" +version := "0.6.4" spName := "potix2/spark-google-spreadsheets" @@ -16,7 +16,7 @@ spIncludeMaven := true spIgnoreProvided := true -sparkVersion := "2.3.3" +sparkVersion := "2.4.4" val testSparkVersion = settingKey[String]("The version of Spark to test against.") @@ -26,7 +26,7 @@ sparkComponents := Seq("sql") libraryDependencies ++= Seq( "org.slf4j" % "slf4j-api" % "1.7.5" % "provided", - "org.scalatest" %% "scalatest" % "2.2.1" % "test", + "org.scalatest" %% "scalatest" % "3.0.8" % "test", ("com.google.api-client" % "google-api-client" % "1.22.0"). exclude("com.google.guava", "guava-jdk5"), "com.google.oauth-client" % "google-oauth-client-jetty" % "1.22.0", @@ -55,6 +55,9 @@ publishArtifact in Test := false pomIncludeRepository := { _ => false } +//publishMavenStyle := true +//publishTo := Some(Resolver.file("file", new File(Path.userHome.absolutePath+"/.m2/repository"))) + publishTo := { val nexus = "https://oss.sonatype.org/" if (version.value.endsWith("SNAPSHOT")) From 7c1f310c6f9e98741385bcbf9b7313b0716bf2e4 Mon Sep 17 00:00:00 2001 From: Michael Brachmann Date: Tue, 26 Nov 2019 11:00:31 -0500 Subject: [PATCH 2/2] Make user provided schema work and sanitize inferred schema field names --- .../spreadsheets/SpreadsheetRelation.scala | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/main/scala/com/github/potix2/spark/google/spreadsheets/SpreadsheetRelation.scala b/src/main/scala/com/github/potix2/spark/google/spreadsheets/SpreadsheetRelation.scala index db19bf3..e7b1a99 100644 --- a/src/main/scala/com/github/potix2/spark/google/spreadsheets/SpreadsheetRelation.scala +++ b/src/main/scala/com/github/potix2/spark/google/spreadsheets/SpreadsheetRelation.scala @@ -29,6 +29,7 @@ case class SpreadsheetRelation protected[spark] ( import com.github.potix2.spark.google.spreadsheets.SparkSpreadsheetService._ + private val fieldMap = scala.collection.mutable.Map[String, String]() override def schema: StructType = userSchema.getOrElse(inferSchema()) private lazy val aWorksheet: SparkWorksheet = @@ -47,6 +48,7 @@ case class SpreadsheetRelation protected[spark] ( override def buildScan(): RDD[Row] = { val aSchema = schema + val schemaMap = fieldMap.toMap sqlContext.sparkContext.makeRDD(rows).mapPartitions { iter => iter.map { m => var index = 0 @@ -55,6 +57,8 @@ case class SpreadsheetRelation protected[spark] ( val field = aSchema.fields(index) rowArray(index) = if (m.contains(field.name)) { TypeCast.castTo(m(field.name), field.dataType, field.nullable) + } else if (schemaMap.contains(field.name) && m.contains(schemaMap(field.name))) { + TypeCast.castTo(m(schemaMap(field.name)), field.dataType, field.nullable) } else { null } @@ -78,9 +82,19 @@ case class SpreadsheetRelation protected[spark] ( } } + def sanitizeColumnName(name: String): String = + { + name + .replaceAll("[^a-zA-Z0-9]+", "_") // Replace sequences of non-alphanumeric characters with underscores + .replaceAll("_+$", "") // Strip trailing underscores + .replaceAll("^[0-9_]+", "") // Strip leading underscores and digits + } + private def inferSchema(): StructType = - StructType(aWorksheet.headers.toList.map { fieldName => - StructField(fieldName, StringType, nullable = true) - }) + StructType(aWorksheet.headers.toList.map { fieldName => { + val sanitizedName = sanitizeColumnName(fieldName) + fieldMap.put(sanitizedName, fieldName) + StructField(sanitizedName, StringType, true) + }}) }