diff --git a/.github/workflows/pyspark.yml b/.github/workflows/pyspark.yml
index 702122c4e..8630a1ce9 100644
--- a/.github/workflows/pyspark.yml
+++ b/.github/workflows/pyspark.yml
@@ -50,7 +50,7 @@ jobs:
- name: Install Python
uses: actions/setup-python@v4
with:
- python-version: 3.9
+ python-version: '3.10'
- name: Install Poetry
working-directory: pyspark
diff --git a/.github/workflows/spark.yaml b/.github/workflows/spark.yaml
index 3d7dc96f4..cae0fc2dd 100644
--- a/.github/workflows/spark.yaml
+++ b/.github/workflows/spark.yaml
@@ -47,11 +47,17 @@ jobs:
matrix:
include:
- mvn-profile: "datasources-32"
- spark: "spark-3.2.2"
- spark-hadoop: "spark-3.2.2-bin-hadoop3.2"
+ spark: "spark-3.2.4"
+ spark-hadoop: "spark-3.2.4-bin-hadoop3.2"
- mvn-profile: "datasources-33"
spark: "spark-3.3.4"
spark-hadoop: "spark-3.3.4-bin-hadoop3"
+ - mvn-profile: "datasources-34"
+ spark: "spark-3.4.3"
+ spark-hadoop: "spark-3.4.3-bin-hadoop3"
+ - mvn-profile: "datasources-35"
+ spark: "spark-3.5.1"
+ spark-hadoop: "spark-3.5.1-bin-hadoop3"
steps:
- uses: actions/checkout@v4
@@ -117,7 +123,10 @@ jobs:
echo "match (a) -[r] -> () delete a, r;match (a) delete a;" | cypher-shell -u ${NEO4J_USR} -p ${NEO4J_PWD} -d neo4j --format plain
scripts/run-graphar2neo4j.sh
+ # Apache Spark version 3.4.3 is not supported by the current NebulaGraph Spark Connector.
- name: Run Nebula2GraphAr example
+ # https://github.com/orgs/community/discussions/37883#discussioncomment-4021318
+ if: ${{ matrix.spark < 'spark-3.4.3' }}
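+ # With this condition, the Spark 3.4 and 3.5 matrix entries skip the Nebula2GraphAr example.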
working-directory: maven-projects/spark
run: |
export JAVA_HOME=${JAVA_HOME_11_X64}
diff --git a/.gitignore b/.gitignore
index 0d520ae44..08a7fbcac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,11 @@
.DS_store
.cache
.ccls-cache
+.dir-locals.el
+.classpath
+.project
+.settings
+.factorypath
compile_commands.json
diff --git a/licenserc.toml b/licenserc.toml
index ed4a4c141..56db55c85 100644
--- a/licenserc.toml
+++ b/licenserc.toml
@@ -48,6 +48,10 @@ excludes = [
"spark/datasources-32/src/main/scala/org/apache/spark/sql/graphar",
"spark/datasources-33/src/main/scala/org/apache/graphar/datasources",
"spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar",
+ "spark/datasources-34/src/main/scala/org/apache/graphar/datasources",
+ "spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar",
+ "spark/datasources-35/src/main/scala/org/apache/graphar/datasources",
+ "spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar",
"java/src/main/java/org/apache/graphar/stdcxx/StdString.java",
"java/src/main/java/org/apache/graphar/stdcxx/StdVector.java",
"java/src/main/java/org/apache/graphar/stdcxx/StdSharedPtr.java",
diff --git a/maven-projects/spark/datasources-32/pom.xml b/maven-projects/spark/datasources-32/pom.xml
index 265eb4c67..9bde15e4d 100644
--- a/maven-projects/spark/datasources-32/pom.xml
+++ b/maven-projects/spark/datasources-32/pom.xml
@@ -77,7 +77,7 @@
${scala.version}
- -target:jvm-1.8
+ -target:jvm-${maven.compiler.target}
-Xss4096K
@@ -128,8 +128,8 @@
org.scalameta
- semanticdb-scalac_2.12.10
- 4.3.24
+ semanticdb-scalac_${scala.version}
+ ${semanticdb-scalac.version}
diff --git a/maven-projects/spark/datasources-33/pom.xml b/maven-projects/spark/datasources-33/pom.xml
index 265eb4c67..9bde15e4d 100644
--- a/maven-projects/spark/datasources-33/pom.xml
+++ b/maven-projects/spark/datasources-33/pom.xml
@@ -77,7 +77,7 @@
${scala.version}
- -target:jvm-1.8
+ -target:jvm-${maven.compiler.target}
-Xss4096K
@@ -128,8 +128,8 @@
org.scalameta
- semanticdb-scalac_2.12.10
- 4.3.24
+ semanticdb-scalac_${scala.version}
+ ${semanticdb-scalac.version}
diff --git a/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala b/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala
index a4d5207b7..e93027634 100644
--- a/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala
+++ b/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala
@@ -212,7 +212,8 @@ case class GarScan(
val parsedOptions = new JSONOptionsInRead(
CaseInsensitiveMap(options.asScala.toMap),
sparkSession.sessionState.conf.sessionLocalTimeZone,
- sparkSession.sessionState.conf.columnNameOfCorruptRecord)
+ sparkSession.sessionState.conf.columnNameOfCorruptRecord
+ )
// Check a field requirement for corrupt records here to throw an exception in a driver side
ExprUtils.verifyColumnNameOfCorruptRecord(
diff --git a/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala b/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala
index 3b2ca60ee..23b51b6e9 100644
--- a/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala
+++ b/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala
@@ -52,7 +52,7 @@ case class GarScanBuilder(
this.filters = dataFilters
formatName match {
case "csv" => Array.empty[Filter]
- case "json" => Array.empty[Filter]
+ case "json" => Array.empty[Filter]
case "orc" => pushedOrcFilters
case "parquet" => pushedParquetFilters
case _ =>
@@ -84,9 +84,9 @@ case class GarScanBuilder(
// Check if the file format supports nested schema pruning.
override protected val supportsNestedSchemaPruning: Boolean =
formatName match {
- case "csv" => false
+ case "csv" => false
case "json" => false
- case "orc" => sparkSession.sessionState.conf.nestedSchemaPruningEnabled
+ case "orc" => sparkSession.sessionState.conf.nestedSchemaPruningEnabled
case "parquet" =>
sparkSession.sessionState.conf.nestedSchemaPruningEnabled
case _ =>
diff --git a/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala b/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala
index e24e9051b..df874ea32 100644
--- a/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala
+++ b/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala
@@ -86,20 +86,20 @@ case class GarTable(
case "parquet" =>
ParquetUtils.inferSchema(sparkSession, options.asScala.toMap, files)
case "json" => {
- val parsedOptions = new JSONOptions(
- options.asScala.toMap,
- sparkSession.sessionState.conf.sessionLocalTimeZone
- )
-
- JsonDataSource(parsedOptions).inferSchema(
- sparkSession,
- files,
- parsedOptions
- )
+ val parsedOptions = new JSONOptions(
+ options.asScala.toMap,
+ sparkSession.sessionState.conf.sessionLocalTimeZone
+ )
+
+ JsonDataSource(parsedOptions).inferSchema(
+ sparkSession,
+ files,
+ parsedOptions
+ )
}
case _ =>
throw new IllegalArgumentException("Invalid format name: " + formatName)
-
+
}
/** Construct a new write builder according to the actual file format. */
diff --git a/maven-projects/spark/datasources-34/.scalafmt.conf b/maven-projects/spark/datasources-34/.scalafmt.conf
new file mode 120000
index 000000000..4cb05e831
--- /dev/null
+++ b/maven-projects/spark/datasources-34/.scalafmt.conf
@@ -0,0 +1 @@
+../.scalafmt.conf
\ No newline at end of file
diff --git a/maven-projects/spark/datasources-34/pom.xml b/maven-projects/spark/datasources-34/pom.xml
new file mode 100644
index 000000000..9bde15e4d
--- /dev/null
+++ b/maven-projects/spark/datasources-34/pom.xml
@@ -0,0 +1,193 @@
+
+
+
+
+ 4.0.0
+
+
+ org.apache.graphar
+ spark
+ ${graphar.version}
+ ../pom.xml
+
+
+ graphar-datasources
+ ${graphar.version}
+ jar
+
+
+
+ org.apache.spark
+ spark-core_${scala.binary.version}
+ ${spark.version}
+ provided
+
+
+ org.apache.spark
+ spark-streaming_${scala.binary.version}
+ ${spark.version}
+ provided
+
+
+ org.apache.spark
+ spark-mllib_${scala.binary.version}
+ ${spark.version}
+ provided
+
+
+ org.apache.spark
+ spark-sql_${scala.binary.version}
+ ${spark.version}
+ provided
+
+
+ org.apache.spark
+ spark-hive_${scala.binary.version}
+ ${spark.version}
+ provided
+
+
+
+
+
+
+ org.scala-tools
+ maven-scala-plugin
+ 2.15.2
+
+ ${scala.version}
+
+ -target:jvm-${maven.compiler.target}
+
+
+ -Xss4096K
+
+
+
+
+ scala-compile
+
+ compile
+
+
+
+ META-INF/*.SF
+ META-INF/*.DSA
+ META-INF/*.RSA
+
+
+
+
+ scala-test-compile
+
+ testCompile
+
+
+
+
+
+ net.alchim31.maven
+ scala-maven-plugin
+ 4.8.0
+
+
+
+ compile
+ testCompile
+
+
+
+
+
+ -Xms64m
+ -Xmx1024m
+
+
+ -Ywarn-unused
+
+
+
+ org.scalameta
+ semanticdb-scalac_${scala.version}
+ ${semanticdb-scalac.version}
+
+
+
+
+
+ com.diffplug.spotless
+ spotless-maven-plugin
+ 2.20.0
+
+
+
+
+
+
+ 1.13.0
+
+
+
+
+
+ ${project.basedir}/.scalafmt.conf
+
+
+
+
+
+ io.github.evis
+ scalafix-maven-plugin_2.13
+ 0.1.8_0.11.0
+
+
+ org.apache.maven.plugins
+ maven-source-plugin
+
+
+ attach-sources
+
+ jar
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+
+
+ attach-javadocs
+
+ jar
+
+
+
+
+
+ maven-site-plugin
+ 3.7.1
+
+
+
+
diff --git a/maven-projects/spark/datasources-34/src/main/java/org/apache/graphar/GeneralParams.java b/maven-projects/spark/datasources-34/src/main/java/org/apache/graphar/GeneralParams.java
new file mode 120000
index 000000000..a3915d619
--- /dev/null
+++ b/maven-projects/spark/datasources-34/src/main/java/org/apache/graphar/GeneralParams.java
@@ -0,0 +1 @@
+../../../../../../../graphar/src/main/java/org/apache/graphar/GeneralParams.java
\ No newline at end of file
diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/graphar/datasources/GarDataSource.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/graphar/datasources/GarDataSource.scala
new file mode 100644
index 000000000..e502f82c6
--- /dev/null
+++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/graphar/datasources/GarDataSource.scala
@@ -0,0 +1,180 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.graphar.datasources
+
+import scala.collection.JavaConverters._
+import scala.util.matching.Regex
+import java.util
+import com.fasterxml.jackson.databind.ObjectMapper
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.apache.spark.sql.connector.catalog.{Table, TableProvider}
+import org.apache.spark.sql.execution.datasources._
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat
+import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
+import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
+import org.apache.spark.sql.execution.datasources.json.JsonFileFormat
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.util.CaseInsensitiveStringMap
+import org.apache.spark.sql.sources.DataSourceRegister
+import org.apache.spark.sql.connector.expressions.Transform
+import org.apache.spark.sql.graphar.GarTable
+
+// Derived from Apache Spark 3.1.1
+// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala
+
+/**
+ * GarDataSource provides GraphAr (gar) files as a data source for Spark.
+ */
+class GarDataSource extends TableProvider with DataSourceRegister {
+ private val REDACTION_REPLACEMENT_TEXT = "*********(redacted)"
+
+ /**
+ * Redact the sensitive information in the given string.
+ */
+ // Copy of redact from graphar Utils
+ private def redact(regex: Option[Regex], text: String): String = {
+ regex match {
+ case None => text
+ case Some(r) =>
+ if (text == null || text.isEmpty) {
+ text
+ } else {
+ r.replaceAllIn(text, REDACTION_REPLACEMENT_TEXT)
+ }
+ }
+ }
+
+ /** The default fallback file format is Parquet. */
+ def fallbackFileFormat: Class[_ <: FileFormat] = classOf[ParquetFileFormat]
+
+ lazy val sparkSession = SparkSession.active
+
+ /** The string that represents the format name. */
+ override def shortName(): String = "gar"
+
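+ // Paths may be provided either as a JSON-encoded array under the "paths" option or as a single "path" option.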
+ protected def getPaths(map: CaseInsensitiveStringMap): Seq[String] = {
+ val objectMapper = new ObjectMapper()
+ val paths = Option(map.get("paths"))
+ .map { pathStr =>
+ objectMapper.readValue(pathStr, classOf[Array[String]]).toSeq
+ }
+ .getOrElse(Seq.empty)
+ paths ++ Option(map.get("path")).toSeq
+ }
+
+ protected def getOptionsWithoutPaths(
+ map: CaseInsensitiveStringMap
+ ): CaseInsensitiveStringMap = {
+ val withoutPath = map.asCaseSensitiveMap().asScala.filterKeys { k =>
+ !k.equalsIgnoreCase("path") && !k.equalsIgnoreCase("paths")
+ }
+ new CaseInsensitiveStringMap(withoutPath.toMap.asJava)
+ }
+
+ protected def getTableName(
+ map: CaseInsensitiveStringMap,
+ paths: Seq[String]
+ ): String = {
+ val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(
+ map.asCaseSensitiveMap().asScala.toMap
+ )
+ val name = shortName() + " " + paths
+ .map(qualifiedPathName(_, hadoopConf))
+ .mkString(",")
+ redact(sparkSession.sessionState.conf.stringRedactionPattern, name)
+ }
+
+ private def qualifiedPathName(
+ path: String,
+ hadoopConf: Configuration
+ ): String = {
+ val hdfsPath = new Path(path)
+ val fs = hdfsPath.getFileSystem(hadoopConf)
+ hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory).toString
+ }
+
+ /** Provide a table from the data source. */
+ def getTable(options: CaseInsensitiveStringMap): Table = {
+ val paths = getPaths(options)
+ val tableName = getTableName(options, paths)
+ val optionsWithoutPaths = getOptionsWithoutPaths(options)
+ GarTable(
+ tableName,
+ sparkSession,
+ optionsWithoutPaths,
+ paths,
+ None,
+ getFallbackFileFormat(options)
+ )
+ }
+
+ /** Provide a table from the data source with specific schema. */
+ def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = {
+ val paths = getPaths(options)
+ val tableName = getTableName(options, paths)
+ val optionsWithoutPaths = getOptionsWithoutPaths(options)
+ GarTable(
+ tableName,
+ sparkSession,
+ optionsWithoutPaths,
+ paths,
+ Some(schema),
+ getFallbackFileFormat(options)
+ )
+ }
+
+ override def supportsExternalMetadata(): Boolean = true
+
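+ // Cached table instance: created lazily by inferSchema and reused by getTable.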
+ private var t: Table = null
+
+ override def inferSchema(options: CaseInsensitiveStringMap): StructType = {
+ if (t == null) t = getTable(options)
+ t.schema()
+ }
+
+ override def inferPartitioning(
+ options: CaseInsensitiveStringMap
+ ): Array[Transform] = {
+ Array.empty
+ }
+
+ override def getTable(
+ schema: StructType,
+ partitioning: Array[Transform],
+ properties: util.Map[String, String]
+ ): Table = {
+ // If the table is already loaded during schema inference, return it directly.
+ if (t != null) {
+ t
+ } else {
+ getTable(new CaseInsensitiveStringMap(properties), schema)
+ }
+ }
+
+ // Get the actual fallback file format.
+ private def getFallbackFileFormat(
+ options: CaseInsensitiveStringMap
+ ): Class[_ <: FileFormat] = options.get("fileFormat") match {
+ case "csv" => classOf[CSVFileFormat]
+ case "orc" => classOf[OrcFileFormat]
+ case "parquet" => classOf[ParquetFileFormat]
+ case "json" => classOf[JsonFileFormat]
+ case _ => throw new IllegalArgumentException
+ }
+}
diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarCommitProtocol.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarCommitProtocol.scala
new file mode 100644
index 000000000..c6ca79c21
--- /dev/null
+++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarCommitProtocol.scala
@@ -0,0 +1,97 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.3.4
+// https://github.com/apache/spark/blob/18db204/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+
+package org.apache.spark.sql.graphar
+
+import org.apache.graphar.GeneralParams
+
+import org.json4s._
+import org.json4s.jackson.JsonMethods._
+
+import org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol
+import org.apache.hadoop.mapreduce._
+import org.apache.spark.internal.Logging
+import org.apache.spark.internal.io.FileNameSpec
+
+object GarCommitProtocol {
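+ // Find the index i such that aggNums(i) <= key < aggNums(i + 1) and return (i, key - aggNums(i)),
+ // i.e. the vertex chunk that partition `key` belongs to and its edge-chunk offset within that chunk.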
+ private def binarySearchPair(aggNums: Array[Int], key: Int): (Int, Int) = {
+ var low = 0
+ var high = aggNums.length - 1
+ var mid = 0
+ while (low <= high) {
+ mid = (high + low) / 2
+ if (
+ aggNums(mid) <= key && (mid == aggNums.length - 1 || aggNums(
+ mid + 1
+ ) > key)
+ ) {
+ return (mid, key - aggNums(mid))
+ } else if (aggNums(mid) > key) {
+ high = mid - 1
+ } else {
+ low = mid + 1
+ }
+ }
+ return (low, key - aggNums(low))
+ }
+}
+
+class GarCommitProtocol(
+ jobId: String,
+ path: String,
+ options: Map[String, String],
+ dynamicPartitionOverwrite: Boolean = false
+) extends SQLHadoopMapReduceCommitProtocol(
+ jobId,
+ path,
+ dynamicPartitionOverwrite
+ )
+ with Serializable
+ with Logging {
+
+ // override getFilename to customize the file name
+ override def getFilename(
+ taskContext: TaskAttemptContext,
+ spec: FileNameSpec
+ ): String = {
+ val partitionId = taskContext.getTaskAttemptID.getTaskID.getId
+ if (options.contains(GeneralParams.offsetStartChunkIndexKey)) {
+ // offset chunk file name, looks like chunk0
+ val chunk_index =
+ options(GeneralParams.offsetStartChunkIndexKey).toInt + partitionId
+ return f"chunk$chunk_index"
+ }
+ if (options.contains(GeneralParams.aggNumListOfEdgeChunkKey)) {
+ // edge chunk file name, looks like part0/chunk0
+ val jValue = parse(
+ options(GeneralParams.aggNumListOfEdgeChunkKey)
+ )
+ implicit val formats =
+ DefaultFormats // default json4s formats used for extraction
+ val aggNums: Array[Int] = Extraction.extract[Array[Int]](jValue)
+ val chunkPair: (Int, Int) =
+ GarCommitProtocol.binarySearchPair(aggNums, partitionId)
+ val vertex_chunk_index: Int = chunkPair._1
+ val edge_chunk_index: Int = chunkPair._2
+ return f"part$vertex_chunk_index/chunk$edge_chunk_index"
+ }
+ // vertex chunk file name, looks like chunk0
+ return f"chunk$partitionId"
+ }
+}
diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala
new file mode 100644
index 000000000..cde86e5d3
--- /dev/null
+++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala
@@ -0,0 +1,362 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.3.4
+// https://github.com/apache/spark/blob/18db204/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala
+
+package org.apache.spark.sql.graphar
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.apache.parquet.hadoop.ParquetInputFormat
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.csv.CSVOptions
+import org.apache.spark.sql.catalyst.json.JSONOptionsInRead
+import org.apache.spark.sql.catalyst.expressions.{ExprUtils, Expression}
+import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
+import org.apache.spark.sql.connector.read.PartitionReaderFactory
+import org.apache.spark.sql.execution.PartitionedFileUtil
+import org.apache.spark.sql.execution.datasources.{
+ FilePartition,
+ PartitionedFile,
+ PartitioningAwareFileIndex
+}
+import org.apache.spark.sql.execution.datasources.parquet.{
+ ParquetOptions,
+ ParquetReadSupport,
+ ParquetWriteSupport
+}
+import org.apache.spark.sql.execution.datasources.v2.FileScan
+import org.apache.spark.sql.execution.datasources.v2.csv.CSVPartitionReaderFactory
+import org.apache.spark.sql.execution.datasources.v2.json.JsonPartitionReaderFactory
+import org.apache.spark.sql.execution.datasources.v2.orc.OrcPartitionReaderFactory
+import org.apache.spark.sql.execution.datasources.orc.OrcOptions
+import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetPartitionReaderFactory
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.sources.Filter
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.util.CaseInsensitiveStringMap
+import org.apache.spark.util.SerializableConfiguration
+
+import scala.collection.mutable.ArrayBuffer
+import scala.jdk.CollectionConverters._
+import org.apache.spark.memory.MemoryMode
+
+/** GarScan is a class to implement the file scan for GarDataSource. */
+case class GarScan(
+ sparkSession: SparkSession,
+ hadoopConf: Configuration,
+ fileIndex: PartitioningAwareFileIndex,
+ dataSchema: StructType,
+ readDataSchema: StructType,
+ readPartitionSchema: StructType,
+ pushedFilters: Array[Filter],
+ options: CaseInsensitiveStringMap,
+ formatName: String,
+ partitionFilters: Seq[Expression] = Seq.empty,
+ dataFilters: Seq[Expression] = Seq.empty
+) extends FileScan {
+
+ /** The gar format is not splitable. */
+ override def isSplitable(path: Path): Boolean = false
+
+ /** Create the reader factory according to the actual file format. */
+ override def createReaderFactory(): PartitionReaderFactory =
+ formatName match {
+ case "csv" => createCSVReaderFactory()
+ case "orc" => createOrcReaderFactory()
+ case "parquet" => createParquetReaderFactory()
+ case "json" => createJSONReaderFactory()
+ case _ =>
+ throw new IllegalArgumentException("Invalid format name: " + formatName)
+ }
+
+ // Create the reader factory for the CSV format.
+ private def createCSVReaderFactory(): PartitionReaderFactory = {
+ val columnPruning = sparkSession.sessionState.conf.csvColumnPruning &&
+ !readDataSchema.exists(
+ _.name == sparkSession.sessionState.conf.columnNameOfCorruptRecord
+ )
+
+ val parsedOptions: CSVOptions = new CSVOptions(
+ options.asScala.toMap,
+ columnPruning = columnPruning,
+ sparkSession.sessionState.conf.sessionLocalTimeZone,
+ sparkSession.sessionState.conf.columnNameOfCorruptRecord
+ )
+
+ // Check a field requirement for corrupt records here to throw an exception on the driver side
+ ExprUtils.verifyColumnNameOfCorruptRecord(
+ dataSchema,
+ parsedOptions.columnNameOfCorruptRecord
+ )
+ // Don't push any filter which refers to the "virtual" column, which cannot be present in the input.
+ // Such filters will be applied later by the upper layer.
+ val actualFilters =
+ pushedFilters.filterNot(
+ _.references.contains(parsedOptions.columnNameOfCorruptRecord)
+ )
+
+ val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap
+ // Hadoop Configurations are case sensitive.
+ val hadoopConf =
+ sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap)
+ val broadcastedConf = sparkSession.sparkContext.broadcast(
+ new SerializableConfiguration(hadoopConf)
+ )
+ // The partition values are already truncated in `FileScan.partitions`.
+ // We should use `readPartitionSchema` as the partition schema here.
+ CSVPartitionReaderFactory(
+ sparkSession.sessionState.conf,
+ broadcastedConf,
+ dataSchema,
+ readDataSchema,
+ readPartitionSchema,
+ parsedOptions,
+ actualFilters
+ )
+ }
+
+ // Create the reader factory for the Orc format.
+ private def createOrcReaderFactory(): PartitionReaderFactory = {
+ val broadcastedConf = sparkSession.sparkContext.broadcast(
+ new SerializableConfiguration(hadoopConf)
+ )
+ // The partition values are already truncated in `FileScan.partitions`.
+ // We should use `readPartitionSchema` as the partition schema here.
+ OrcPartitionReaderFactory(
+ sqlConf = sparkSession.sessionState.conf,
+ broadcastedConf = broadcastedConf,
+ dataSchema = dataSchema,
+ readDataSchema = readDataSchema,
+ partitionSchema = readPartitionSchema,
+ filters = pushedFilters,
+ aggregation = None,
+ options = new OrcOptions(
+ Map.empty[String, String],
+ sparkSession.sessionState.conf
+ ),
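+ // memoryMode: keep the vectorized ORC reader on-heap.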
+ memoryMode = MemoryMode.ON_HEAP
+ )
+ }
+
+ // Create the reader factory for the Parquet format.
+ private def createParquetReaderFactory(): PartitionReaderFactory = {
+ val readDataSchemaAsJson = readDataSchema.json
+ hadoopConf.set(
+ ParquetInputFormat.READ_SUPPORT_CLASS,
+ classOf[ParquetReadSupport].getName
+ )
+ hadoopConf.set(
+ ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA,
+ readDataSchemaAsJson
+ )
+ hadoopConf.set(ParquetWriteSupport.SPARK_ROW_SCHEMA, readDataSchemaAsJson)
+ hadoopConf.set(
+ SQLConf.SESSION_LOCAL_TIMEZONE.key,
+ sparkSession.sessionState.conf.sessionLocalTimeZone
+ )
+ hadoopConf.setBoolean(
+ SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key,
+ sparkSession.sessionState.conf.nestedSchemaPruningEnabled
+ )
+ hadoopConf.setBoolean(
+ SQLConf.CASE_SENSITIVE.key,
+ sparkSession.sessionState.conf.caseSensitiveAnalysis
+ )
+
+ ParquetWriteSupport.setSchema(readDataSchema, hadoopConf)
+
+ // Sets flags for `ParquetToSparkSchemaConverter`
+ hadoopConf.setBoolean(
+ SQLConf.PARQUET_BINARY_AS_STRING.key,
+ sparkSession.sessionState.conf.isParquetBinaryAsString
+ )
+ hadoopConf.setBoolean(
+ SQLConf.PARQUET_INT96_AS_TIMESTAMP.key,
+ sparkSession.sessionState.conf.isParquetINT96AsTimestamp
+ )
+ hadoopConf.setBoolean(
+ SQLConf.LEGACY_PARQUET_NANOS_AS_LONG.key,
+ sparkSession.sessionState.conf.legacyParquetNanosAsLong
+ )
+ hadoopConf.setBoolean(
+ SQLConf.PARQUET_FIELD_ID_READ_ENABLED.key,
+ sparkSession.sessionState.conf.parquetFieldIdReadEnabled
+ )
+ hadoopConf.setBoolean(
+ SQLConf.PARQUET_INFER_TIMESTAMP_NTZ_ENABLED.key,
+ sparkSession.sessionState.conf.parquetInferTimestampNTZEnabled
+ )
+
+ val broadcastedConf = sparkSession.sparkContext.broadcast(
+ new SerializableConfiguration(hadoopConf)
+ )
+ val sqlConf = sparkSession.sessionState.conf
+ ParquetPartitionReaderFactory(
+ sqlConf = sqlConf,
+ broadcastedConf = broadcastedConf,
+ dataSchema = dataSchema,
+ readDataSchema = readDataSchema,
+ partitionSchema = readPartitionSchema,
+ filters = pushedFilters,
+ aggregation = None,
+ new ParquetOptions(options.asCaseSensitiveMap.asScala.toMap, sqlConf)
+ )
+ }
+
+ // Create the reader factory for the JSON format.
+ private def createJSONReaderFactory(): PartitionReaderFactory = {
+ val parsedOptions = new JSONOptionsInRead(
+ CaseInsensitiveMap(options.asScala.toMap),
+ sparkSession.sessionState.conf.sessionLocalTimeZone,
+ sparkSession.sessionState.conf.columnNameOfCorruptRecord
+ )
+
+ // Check a field requirement for corrupt records here to throw an exception on the driver side
+ ExprUtils.verifyColumnNameOfCorruptRecord(
+ dataSchema,
+ parsedOptions.columnNameOfCorruptRecord
+ )
+ // Don't push any filter which refers to the "virtual" column, which cannot be present in the input.
+ // Such filters will be applied later by the upper layer.
+ val actualFilters =
+ pushedFilters.filterNot(
+ _.references.contains(parsedOptions.columnNameOfCorruptRecord)
+ )
+
+ val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap
+ // Hadoop Configurations are case sensitive.
+ val hadoopConf =
+ sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap)
+ val broadcastedConf = sparkSession.sparkContext.broadcast(
+ new SerializableConfiguration(hadoopConf)
+ )
+ // The partition values are already truncated in `FileScan.partitions`.
+ // We should use `readPartitionSchema` as the partition schema here.
+ JsonPartitionReaderFactory(
+ sparkSession.sessionState.conf,
+ broadcastedConf,
+ dataSchema,
+ readDataSchema,
+ readPartitionSchema,
+ parsedOptions,
+ actualFilters
+ )
+ }
+
+ /**
+ * Override "partitions" of
+ * org.apache.spark.sql.execution.datasources.v2.FileScan to disable splitting
+ * and sort the files by file paths instead of by file sizes. Note: this
+ * implementation does not support partition attributes.
+ */
+ override protected def partitions: Seq[FilePartition] = {
+ val selectedPartitions = fileIndex.listFiles(partitionFilters, dataFilters)
+ val maxSplitBytes =
+ FilePartition.maxSplitBytes(sparkSession, selectedPartitions)
+
+ val splitFiles = selectedPartitions.flatMap { partition =>
+ val partitionValues = partition.values
+ partition.files
+ .flatMap { file =>
+ val filePath = file.getPath
+ PartitionedFileUtil.splitFiles(
+ sparkSession = sparkSession,
+ file = file,
+ filePath = filePath,
+ isSplitable = isSplitable(filePath),
+ maxSplitBytes = maxSplitBytes,
+ partitionValues = partitionValues
+ )
+ }
+ .toArray
+ // Starting from Spark 3.4, PartitionedFile.filePath is a SparkPath, not a String.
+ .sortBy(_.filePath.toPath)
+ }
+
+ getFilePartitions(sparkSession, splitFiles)
+ }
+
+ /**
+ * Override "getFilePartitions" of
+ * org.apache.spark.sql.execution.datasources.FilePartition to assign each
+ * chunk file in GraphAr to a single partition.
+ */
+ private def getFilePartitions(
+ sparkSession: SparkSession,
+ partitionedFiles: Seq[PartitionedFile]
+ ): Seq[FilePartition] = {
+ val partitions = new ArrayBuffer[FilePartition]
+ val currentFiles = new ArrayBuffer[PartitionedFile]
+
+ /** Close the current partition and move to the next. */
+ def closePartition(): Unit = {
+ if (currentFiles.nonEmpty) {
+ // Copy to a new Array.
+ val newPartition = FilePartition(partitions.size, currentFiles.toArray)
+ partitions += newPartition
+ }
+ currentFiles.clear()
+ }
+ // Assign a file to each partition
+ partitionedFiles.foreach { file =>
+ closePartition()
+ // Add the given file to the current partition.
+ currentFiles += file
+ }
+ closePartition()
+ partitions.toSeq
+ }
+
+ /** Check if two objects are equal. */
+ override def equals(obj: Any): Boolean = obj match {
+ case g: GarScan =>
+ super.equals(g) && dataSchema == g.dataSchema && options == g.options &&
+ equivalentFilters(
+ pushedFilters,
+ g.pushedFilters
+ ) && formatName == g.formatName
+ case _ => false
+ }
+
+ /** Get the hash code of the object. */
+ override def hashCode(): Int = formatName match {
+ case "csv" => super.hashCode()
+ case "json" => super.hashCode()
+ case "orc" => getClass.hashCode()
+ case "parquet" => getClass.hashCode()
+ case _ =>
+ throw new IllegalArgumentException("Invalid format name: " + formatName)
+ }
+
+ /** Get the description string of the object. */
+ override def description(): String = {
+ super.description() + ", PushedFilters: " + seqToString(pushedFilters)
+ }
+
+ /** Get the meta data map of the object. */
+ override def getMetaData(): Map[String, String] = {
+ super.getMetaData() ++ Map("PushedFilters" -> seqToString(pushedFilters))
+ }
+
+ /** Construct the file scan with filters. */
+ def withFilters(
+ partitionFilters: Seq[Expression],
+ dataFilters: Seq[Expression]
+ ): FileScan =
+ this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters)
+}
diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala
new file mode 100644
index 000000000..706b72ae3
--- /dev/null
+++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala
@@ -0,0 +1,109 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.3.4
+// https://github.com/apache/spark/blob/18db204/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScanBuilder.scala
+
+package org.apache.spark.sql.graphar
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.connector.read.Scan
+import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
+import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder
+import org.apache.spark.sql.execution.datasources.v2.orc.OrcScanBuilder
+import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScanBuilder
+import org.apache.spark.sql.sources.Filter
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.util.CaseInsensitiveStringMap
+
+import scala.collection.JavaConverters._
+
+/** GarScanBuilder is a class to build the file scan for GarDataSource. */
+case class GarScanBuilder(
+ sparkSession: SparkSession,
+ fileIndex: PartitioningAwareFileIndex,
+ schema: StructType,
+ dataSchema: StructType,
+ options: CaseInsensitiveStringMap,
+ formatName: String
+) extends FileScanBuilder(sparkSession, fileIndex, dataSchema) {
+ lazy val hadoopConf = {
+ val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap
+ // Hadoop Configurations are case sensitive.
+ sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap)
+ }
+
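+ // Data filters pushed down by Spark; forwarded to the ORC/Parquet scan builders below.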
+ private var filters: Array[Filter] = Array.empty
+
+ override def pushDataFilters(dataFilters: Array[Filter]): Array[Filter] = {
+ this.filters = dataFilters
+ formatName match {
+ case "csv" => Array.empty[Filter]
+ case "json" => Array.empty[Filter]
+ case "orc" => pushedOrcFilters
+ case "parquet" => pushedParquetFilters
+ case _ =>
+ throw new IllegalArgumentException("Invalid format name: " + formatName)
+ }
+ }
+
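+ // Delegate Parquet filter pushdown to Spark's ParquetScanBuilder so the pushed filters match a plain Parquet scan.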
+ private lazy val pushedParquetFilters: Array[Filter] = {
+ if (!sparkSession.sessionState.conf.parquetFilterPushDown) {
+ Array.empty[Filter]
+ } else {
+ val builder =
+ ParquetScanBuilder(sparkSession, fileIndex, schema, dataSchema, options)
+ builder.pushDataFilters(this.filters)
+ }
+ }
+
+ private lazy val pushedOrcFilters: Array[Filter] = {
+ if (!sparkSession.sessionState.conf.orcFilterPushDown) {
+ Array.empty[Filter]
+ } else {
+ val builder =
+ OrcScanBuilder(sparkSession, fileIndex, schema, dataSchema, options)
+ builder.pushDataFilters(this.filters)
+ }
+ }
+
+ // Check if the file format supports nested schema pruning.
+ override protected val supportsNestedSchemaPruning: Boolean =
+ formatName match {
+ case "csv" => false
+ case "json" => false
+ case "orc" => sparkSession.sessionState.conf.nestedSchemaPruningEnabled
+ case "parquet" =>
+ sparkSession.sessionState.conf.nestedSchemaPruningEnabled
+ case _ =>
+ throw new IllegalArgumentException("Invalid format name: " + formatName)
+ }
+
+ /** Build the file scan for GarDataSource. */
+ override def build(): Scan = {
+ GarScan(
+ sparkSession,
+ hadoopConf,
+ fileIndex,
+ dataSchema,
+ readDataSchema(),
+ readPartitionSchema(),
+ pushedDataFilters,
+ options,
+ formatName
+ )
+ }
+}
diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala
new file mode 100644
index 000000000..df874ea32
--- /dev/null
+++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala
@@ -0,0 +1,150 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.1.1
+// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala
+
+package org.apache.spark.sql.graphar
+
+import org.apache.hadoop.fs.FileStatus
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.csv.CSVOptions
+import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
+import org.apache.spark.sql.execution.datasources.FileFormat
+import org.apache.spark.sql.execution.datasources.csv.CSVDataSource
+import org.apache.spark.sql.execution.datasources.orc.OrcUtils
+import org.apache.spark.sql.execution.datasources.parquet.ParquetUtils
+import org.apache.spark.sql.execution.datasources.v2.FileTable
+import org.apache.spark.sql.graphar.csv.CSVWriteBuilder
+import org.apache.spark.sql.graphar.orc.OrcWriteBuilder
+import org.apache.spark.sql.graphar.parquet.ParquetWriteBuilder
+import org.apache.spark.sql.graphar.json.JSONWriteBuilder
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.util.CaseInsensitiveStringMap
+import org.apache.spark.sql.execution.datasources.json.JsonDataSource
+import org.apache.spark.sql.catalyst.json.JSONOptions
+
+import scala.collection.JavaConverters._
+
+/** GarTable is a class to represent the graph data in GraphAr as a table. */
+case class GarTable(
+ name: String,
+ sparkSession: SparkSession,
+ options: CaseInsensitiveStringMap,
+ paths: Seq[String],
+ userSpecifiedSchema: Option[StructType],
+ fallbackFileFormat: Class[_ <: FileFormat]
+) extends FileTable(sparkSession, options, paths, userSpecifiedSchema) {
+
+ /** Construct a new scan builder. */
+ override def newScanBuilder(
+ options: CaseInsensitiveStringMap
+ ): GarScanBuilder =
+ new GarScanBuilder(
+ sparkSession,
+ fileIndex,
+ schema,
+ dataSchema,
+ options,
+ formatName
+ )
+
+ /**
+ * Infer the schema of the table through the methods of the actual file
+ * format.
+ */
+ override def inferSchema(files: Seq[FileStatus]): Option[StructType] =
+ formatName match {
+ case "csv" => {
+ val parsedOptions = new CSVOptions(
+ options.asScala.toMap,
+ columnPruning = sparkSession.sessionState.conf.csvColumnPruning,
+ sparkSession.sessionState.conf.sessionLocalTimeZone
+ )
+
+ CSVDataSource(parsedOptions).inferSchema(
+ sparkSession,
+ files,
+ parsedOptions
+ )
+ }
+ case "orc" =>
+ OrcUtils.inferSchema(sparkSession, files, options.asScala.toMap)
+ case "parquet" =>
+ ParquetUtils.inferSchema(sparkSession, options.asScala.toMap, files)
+ case "json" => {
+ val parsedOptions = new JSONOptions(
+ options.asScala.toMap,
+ sparkSession.sessionState.conf.sessionLocalTimeZone
+ )
+
+ JsonDataSource(parsedOptions).inferSchema(
+ sparkSession,
+ files,
+ parsedOptions
+ )
+ }
+ case _ =>
+ throw new IllegalArgumentException("Invalid format name: " + formatName)
+
+ }
+
+ /** Construct a new write builder according to the actual file format. */
+ override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder =
+ formatName match {
+ case "csv" =>
+ new CSVWriteBuilder(paths, formatName, supportsDataType, info)
+ case "orc" =>
+ new OrcWriteBuilder(paths, formatName, supportsDataType, info)
+ case "parquet" =>
+ new ParquetWriteBuilder(paths, formatName, supportsDataType, info)
+ case "json" =>
+ new JSONWriteBuilder(paths, formatName, supportsDataType, info)
+ case _ =>
+ throw new IllegalArgumentException("Invalid format name: " + formatName)
+ }
+
+ /**
+ * Check if a data type is supported. Note: Currently, the GraphAr data source
+ * only supports several atomic data types. To support additional data types
+ * such as Struct, Array and Map, revise this function to handle them case by
+ * case as the commented code shows.
+ */
+ override def supportsDataType(dataType: DataType): Boolean = dataType match {
+ // case _: AnsiIntervalType => false
+
+ case _: AtomicType => true
+
+ // case st: StructType => st.forall { f => supportsDataType(f.dataType) }
+
+ case ArrayType(elementType, _) =>
+ formatName match {
+ case "orc" => supportsDataType(elementType)
+ case "parquet" => supportsDataType(elementType)
+ case _ => false
+ }
+
+ // case MapType(keyType, valueType, _) =>
+ // supportsDataType(keyType) && supportsDataType(valueType)
+
+ // case udt: UserDefinedType[_] => supportsDataType(udt.sqlType)
+
+ case _ => false
+ }
+
+ /** The actual file format for storing the data in GraphAr. */
+ override def formatName: String = options.get("fileFormat")
+}
diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarWriteBuilder.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarWriteBuilder.scala
new file mode 100644
index 000000000..58f1890da
--- /dev/null
+++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarWriteBuilder.scala
@@ -0,0 +1,176 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.3.4
+// https://github.com/apache/spark/blob/18db204/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriteBuilder.scala
+
+package org.apache.spark.sql.graphar
+
+import java.util.UUID
+
+import scala.collection.JavaConverters._
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.mapreduce.Job
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
+
+import org.apache.spark.sql.execution.datasources.OutputWriterFactory
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils}
+import org.apache.spark.sql.connector.write.{
+ BatchWrite,
+ LogicalWriteInfo,
+ WriteBuilder
+}
+import org.apache.spark.sql.execution.datasources.{
+ BasicWriteJobStatsTracker,
+ DataSource,
+ WriteJobDescription
+}
+import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.{DataType, StructType}
+import org.apache.spark.util.SerializableConfiguration
+import org.apache.spark.sql.execution.datasources.v2.FileBatchWrite
+import org.apache.spark.sql.catalyst.expressions.AttributeReference
+
+abstract class GarWriteBuilder(
+ paths: Seq[String],
+ formatName: String,
+ supportsDataType: DataType => Boolean,
+ info: LogicalWriteInfo
+) extends WriteBuilder {
+ private val schema = info.schema()
+ private val queryId = info.queryId()
+ private val options = info.options()
+
+ override def buildForBatch(): BatchWrite = {
+ val sparkSession = SparkSession.active
+ validateInputs(sparkSession.sessionState.conf.caseSensitiveAnalysis)
+ val path = new Path(paths.head)
+ val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap
+ // Hadoop Configurations are case sensitive.
+ val hadoopConf =
+ sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap)
+ val job = getJobInstance(hadoopConf, path)
+ val committer = new GarCommitProtocol(
+ java.util.UUID.randomUUID().toString,
+ paths.head,
+ options.asScala.toMap,
+ false
+ )
+ lazy val description =
+ createWriteJobDescription(
+ sparkSession,
+ hadoopConf,
+ job,
+ paths.head,
+ options.asScala.toMap
+ )
+
+ committer.setupJob(job)
+ new FileBatchWrite(job, description, committer)
+ }
+
+ def prepareWrite(
+ sqlConf: SQLConf,
+ job: Job,
+ options: Map[String, String],
+ dataSchema: StructType
+ ): OutputWriterFactory
+
+ private def validateInputs(caseSensitiveAnalysis: Boolean): Unit = {
+ assert(schema != null, "Missing input data schema")
+ assert(queryId != null, "Missing query ID")
+
+ if (paths.length != 1) {
+ throw new IllegalArgumentException(
+ "Expected exactly one path to be specified, but " +
+ s"got: ${paths.mkString(", ")}"
+ )
+ }
+ val pathName = paths.head
+ val sqlConf = SparkSession.active.sessionState.conf
+ DataSource.validateSchema(schema, sqlConf)
+
+ schema.foreach { field =>
+ if (!supportsDataType(field.dataType)) {
+ throw new IllegalArgumentException(
+ s"$formatName data source does not support ${field.dataType.catalogString} data type."
+ )
+ }
+ }
+ }
+
+ private def getJobInstance(hadoopConf: Configuration, path: Path): Job = {
+ val job = Job.getInstance(hadoopConf)
+ job.setOutputKeyClass(classOf[Void])
+ job.setOutputValueClass(classOf[InternalRow])
+ FileOutputFormat.setOutputPath(job, path)
+ job
+ }
+
+ private def createWriteJobDescription(
+ sparkSession: SparkSession,
+ hadoopConf: Configuration,
+ job: Job,
+ pathName: String,
+ options: Map[String, String]
+ ): WriteJobDescription = {
+ val caseInsensitiveOptions = CaseInsensitiveMap(options)
+ // Note: prepareWrite has a side effect: it configures "job".
+ val outputWriterFactory =
+ prepareWrite(
+ sparkSession.sessionState.conf,
+ job,
+ caseInsensitiveOptions,
+ schema
+ )
+ // Same as schema.toAttributes, which is private to the spark package.
+ val allColumns: Seq[AttributeReference] = schema.map(f =>
+ AttributeReference(f.name, f.dataType, f.nullable, f.metadata)()
+ )
+ val metrics: Map[String, SQLMetric] = BasicWriteJobStatsTracker.metrics
+ val serializableHadoopConf = new SerializableConfiguration(hadoopConf)
+ val statsTracker =
+ new BasicWriteJobStatsTracker(serializableHadoopConf, metrics)
+ // TODO: after partitioning is supported in V2:
+ // 1. filter out partition columns in `dataColumns`.
+ // 2. Don't use Seq.empty for `partitionColumns`.
+ new WriteJobDescription(
+ uuid = UUID.randomUUID().toString,
+ serializableHadoopConf =
+ new SerializableConfiguration(job.getConfiguration),
+ outputWriterFactory = outputWriterFactory,
+ allColumns = allColumns,
+ dataColumns = allColumns,
+ partitionColumns = Seq.empty,
+ bucketSpec = None,
+ path = pathName,
+ customPartitionLocations = Map.empty,
+ maxRecordsPerFile = caseInsensitiveOptions
+ .get("maxRecordsPerFile")
+ .map(_.toLong)
+ .getOrElse(sparkSession.sessionState.conf.maxRecordsPerFile),
+ timeZoneId = caseInsensitiveOptions
+ .get(DateTimeUtils.TIMEZONE_OPTION)
+ .getOrElse(sparkSession.sessionState.conf.sessionLocalTimeZone),
+ statsTrackers = Seq(statsTracker)
+ )
+ }
+}
diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/csv/CSVWriteBuilder.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/csv/CSVWriteBuilder.scala
new file mode 100644
index 000000000..68e156e07
--- /dev/null
+++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/csv/CSVWriteBuilder.scala
@@ -0,0 +1,72 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.1.1
+// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVWriteBuilder.scala
+
+package org.apache.spark.sql.graphar.csv
+
+import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
+import org.apache.spark.sql.catalyst.csv.CSVOptions
+import org.apache.spark.sql.catalyst.util.CompressionCodecs
+import org.apache.spark.sql.connector.write.LogicalWriteInfo
+import org.apache.spark.sql.execution.datasources.{
+ CodecStreams,
+ OutputWriter,
+ OutputWriterFactory
+}
+import org.apache.spark.sql.execution.datasources.csv.CsvOutputWriter
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.{DataType, StructType}
+import org.apache.spark.sql.graphar.GarWriteBuilder
+
+class CSVWriteBuilder(
+ paths: Seq[String],
+ formatName: String,
+ supportsDataType: DataType => Boolean,
+ info: LogicalWriteInfo
+) extends GarWriteBuilder(paths, formatName, supportsDataType, info) {
+ override def prepareWrite(
+ sqlConf: SQLConf,
+ job: Job,
+ options: Map[String, String],
+ dataSchema: StructType
+ ): OutputWriterFactory = {
+ val conf = job.getConfiguration
+ val csvOptions = new CSVOptions(
+ options,
+ columnPruning = sqlConf.csvColumnPruning,
+ sqlConf.sessionLocalTimeZone
+ )
+ csvOptions.compressionCodec.foreach { codec =>
+ CompressionCodecs.setCodecConfiguration(conf, codec)
+ }
+
+ new OutputWriterFactory {
+ override def newInstance(
+ path: String,
+ dataSchema: StructType,
+ context: TaskAttemptContext
+ ): OutputWriter = {
+ new CsvOutputWriter(path, dataSchema, context, csvOptions)
+ }
+
+ override def getFileExtension(context: TaskAttemptContext): String = {
+ ".csv" + CodecStreams.getCompressionExtension(context)
+ }
+ }
+ }
+}
diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/json/JSONWriteBuilder.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/json/JSONWriteBuilder.scala
new file mode 100644
index 000000000..150a9a9f8
--- /dev/null
+++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/json/JSONWriteBuilder.scala
@@ -0,0 +1,73 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.5.1
+// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonWriteBuilder.scala
+
+package org.apache.spark.sql.graphar.json
+import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
+import org.apache.spark.sql.catalyst.util.CompressionCodecs
+import org.apache.spark.sql.connector.write.LogicalWriteInfo
+import org.apache.spark.sql.execution.datasources.json.JsonOutputWriter
+import org.apache.spark.sql.execution.datasources.{
+ CodecStreams,
+ OutputWriter,
+ OutputWriterFactory
+}
+
+import org.apache.spark.sql.catalyst.json.JSONOptions
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.{StructType, DataType}
+
+import org.apache.spark.sql.graphar.GarWriteBuilder
+
+class JSONWriteBuilder(
+ paths: Seq[String],
+ formatName: String,
+ supportsDataType: DataType => Boolean,
+ info: LogicalWriteInfo
+) extends GarWriteBuilder(paths, formatName, supportsDataType, info) {
+ override def prepareWrite(
+ sqlConf: SQLConf,
+ job: Job,
+ options: Map[String, String],
+ dataSchema: StructType
+ ): OutputWriterFactory = {
+ val conf = job.getConfiguration
+ val parsedOptions = new JSONOptions(
+ options,
+ sqlConf.sessionLocalTimeZone,
+ sqlConf.columnNameOfCorruptRecord
+ )
+ parsedOptions.compressionCodec.foreach { codec =>
+ CompressionCodecs.setCodecConfiguration(conf, codec)
+ }
+
+ new OutputWriterFactory {
+ override def newInstance(
+ path: String,
+ dataSchema: StructType,
+ context: TaskAttemptContext
+ ): OutputWriter = {
+ new JsonOutputWriter(path, parsedOptions, dataSchema, context)
+ }
+
+ override def getFileExtension(context: TaskAttemptContext): String = {
+ ".json" + CodecStreams.getCompressionExtension(context)
+ }
+ }
+ }
+}
diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/orc/OrcOutputWriter.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/orc/OrcOutputWriter.scala
new file mode 100644
index 000000000..ccc7a48e1
--- /dev/null
+++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/orc/OrcOutputWriter.scala
@@ -0,0 +1,70 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.1.1. Since OrcOutputWriter is private in the original source,
+// we have to reimplement it here.
+// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcOutputWriter.scala
+
+package org.apache.spark.sql.graphar.orc
+
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.io.NullWritable
+import org.apache.hadoop.mapreduce.TaskAttemptContext
+import org.apache.orc.OrcFile
+import org.apache.orc.mapred.{
+ OrcOutputFormat => OrcMapRedOutputFormat,
+ OrcStruct
+}
+import org.apache.orc.mapreduce.{OrcMapreduceRecordWriter, OrcOutputFormat}
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.execution.datasources.OutputWriter
+import org.apache.spark.sql.execution.datasources.orc.{OrcSerializer, OrcUtils}
+import org.apache.spark.sql.types._
+
+class OrcOutputWriter(
+ val path: String,
+ dataSchema: StructType,
+ context: TaskAttemptContext
+) extends OutputWriter {
+
+ private[this] val serializer = new OrcSerializer(dataSchema)
+
+ private val recordWriter = {
+ val orcOutputFormat = new OrcOutputFormat[OrcStruct]() {
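+      // Return the exact path provided to this writer instead of the default task work file,
+      // so the ORC data is written to the file name chosen by the caller.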
+ override def getDefaultWorkFile(
+ context: TaskAttemptContext,
+ extension: String
+ ): Path = {
+ new Path(path)
+ }
+ }
+ val filename = orcOutputFormat.getDefaultWorkFile(context, ".orc")
+ val options = OrcMapRedOutputFormat.buildOptions(context.getConfiguration)
+ val writer = OrcFile.createWriter(filename, options)
+ val recordWriter = new OrcMapreduceRecordWriter[OrcStruct](writer)
+ OrcUtils.addSparkVersionMetadata(writer)
+ recordWriter
+ }
+
+ override def write(row: InternalRow): Unit = {
+ recordWriter.write(NullWritable.get(), serializer.serialize(row))
+ }
+
+ override def close(): Unit = {
+ recordWriter.close(context)
+ }
+}
diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/orc/OrcWriteBuilder.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/orc/OrcWriteBuilder.scala
new file mode 100644
index 000000000..287162f8e
--- /dev/null
+++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/orc/OrcWriteBuilder.scala
@@ -0,0 +1,104 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.1.1
+// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/ORCWriteBuilder.scala
+
+package org.apache.spark.sql.graphar.orc
+
+import org.apache.hadoop.mapred.JobConf
+import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
+import org.apache.orc.OrcConf.{COMPRESS, MAPRED_OUTPUT_SCHEMA}
+import org.apache.orc.mapred.OrcStruct
+
+import org.apache.spark.sql.connector.write.LogicalWriteInfo
+import org.apache.spark.sql.execution.datasources.{
+ OutputWriter,
+ OutputWriterFactory
+}
+import org.apache.spark.sql.execution.datasources.orc.{OrcOptions, OrcUtils}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types._
+
+import org.apache.spark.sql.graphar.GarWriteBuilder
+
+object OrcWriteBuilder {
+  // Mirrors the getQuotedSchemaString method of Spark's OrcFileFormat.
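+  // For example, a schema with fields (id: long, name: string) is rendered as
+  // "struct<`id`:bigint,`name`:string>".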
+ private def getQuotedSchemaString(dataType: DataType): String =
+ dataType match {
+ case StructType(fields) =>
+ fields
+ .map(f => s"`${f.name}`:${getQuotedSchemaString(f.dataType)}")
+ .mkString("struct<", ",", ">")
+ case ArrayType(elementType, _) =>
+ s"array<${getQuotedSchemaString(elementType)}>"
+ case MapType(keyType, valueType, _) =>
+ s"map<${getQuotedSchemaString(keyType)},${getQuotedSchemaString(valueType)}>"
+ case _ => // UDT and others
+ dataType.catalogString
+ }
+}
+
+class OrcWriteBuilder(
+ paths: Seq[String],
+ formatName: String,
+ supportsDataType: DataType => Boolean,
+ info: LogicalWriteInfo
+) extends GarWriteBuilder(paths, formatName, supportsDataType, info) {
+
+ override def prepareWrite(
+ sqlConf: SQLConf,
+ job: Job,
+ options: Map[String, String],
+ dataSchema: StructType
+ ): OutputWriterFactory = {
+ val orcOptions = new OrcOptions(options, sqlConf)
+
+ val conf = job.getConfiguration
+
+ conf.set(
+ MAPRED_OUTPUT_SCHEMA.getAttribute,
+ OrcWriteBuilder.getQuotedSchemaString(dataSchema)
+ )
+
+ conf.set(COMPRESS.getAttribute, orcOptions.compressionCodec)
+
+ conf
+ .asInstanceOf[JobConf]
+ .setOutputFormat(
+ classOf[org.apache.orc.mapred.OrcOutputFormat[OrcStruct]]
+ )
+
+ new OutputWriterFactory {
+ override def newInstance(
+ path: String,
+ dataSchema: StructType,
+ context: TaskAttemptContext
+ ): OutputWriter = {
+ new OrcOutputWriter(path, dataSchema, context)
+ }
+
+ override def getFileExtension(context: TaskAttemptContext): String = {
+ val compressionExtension: String = {
+ val name = context.getConfiguration.get(COMPRESS.getAttribute)
+ OrcUtils.extensionsForCompressionCodecNames.getOrElse(name, "")
+ }
+
+ compressionExtension + ".orc"
+ }
+ }
+ }
+}
diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/parquet/ParquetWriteBuilder.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/parquet/ParquetWriteBuilder.scala
new file mode 100644
index 000000000..8e53dc5f8
--- /dev/null
+++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/parquet/ParquetWriteBuilder.scala
@@ -0,0 +1,152 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.1.1
+// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetWriteBuilder.scala
+
+package org.apache.spark.sql.graphar.parquet
+
+import org.apache.hadoop.mapreduce.{Job, OutputCommitter, TaskAttemptContext}
+import org.apache.parquet.hadoop.{ParquetOutputCommitter, ParquetOutputFormat}
+import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel
+import org.apache.parquet.hadoop.codec.CodecConfig
+import org.apache.parquet.hadoop.util.ContextUtil
+
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.connector.write.LogicalWriteInfo
+import org.apache.spark.sql.execution.datasources.{
+ OutputWriter,
+ OutputWriterFactory
+}
+import org.apache.spark.sql.execution.datasources.parquet._
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types._
+
+import org.apache.spark.sql.graphar.GarWriteBuilder
+
+class ParquetWriteBuilder(
+ paths: Seq[String],
+ formatName: String,
+ supportsDataType: DataType => Boolean,
+ info: LogicalWriteInfo
+) extends GarWriteBuilder(paths, formatName, supportsDataType, info)
+ with Logging {
+
+ override def prepareWrite(
+ sqlConf: SQLConf,
+ job: Job,
+ options: Map[String, String],
+ dataSchema: StructType
+ ): OutputWriterFactory = {
+ val parquetOptions = new ParquetOptions(options, sqlConf)
+
+ val conf = ContextUtil.getConfiguration(job)
+
+ val committerClass =
+ conf.getClass(
+ SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key,
+ classOf[ParquetOutputCommitter],
+ classOf[OutputCommitter]
+ )
+
+ if (conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) == null) {
+ logInfo(
+ "Using default output committer for Parquet: " +
+ classOf[ParquetOutputCommitter].getCanonicalName
+ )
+ } else {
+ logInfo(
+ "Using user defined output committer for Parquet: " + committerClass.getCanonicalName
+ )
+ }
+
+ conf.setClass(
+ SQLConf.OUTPUT_COMMITTER_CLASS.key,
+ committerClass,
+ classOf[OutputCommitter]
+ )
+
+ // We're not really using `ParquetOutputFormat[Row]` for writing data here, because we override
+ // it in `ParquetOutputWriter` to support appending and dynamic partitioning. The reason why
+ // we set it here is to setup the output committer class to `ParquetOutputCommitter`, which is
+ // bundled with `ParquetOutputFormat[Row]`.
+ job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]])
+
+ ParquetOutputFormat.setWriteSupportClass(job, classOf[ParquetWriteSupport])
+
+ // This metadata is useful for keeping UDTs like Vector/Matrix.
+ ParquetWriteSupport.setSchema(dataSchema, conf)
+
+ // Sets flags for `ParquetWriteSupport`, which converts Catalyst schema to Parquet
+ // schema and writes actual rows to Parquet files.
+ conf.set(
+ SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key,
+ sqlConf.writeLegacyParquetFormat.toString
+ )
+
+ conf.set(
+ SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key,
+ sqlConf.parquetOutputTimestampType.toString
+ )
+
+ // Sets compression scheme
+ conf.set(
+ ParquetOutputFormat.COMPRESSION,
+ parquetOptions.compressionCodecClassName
+ )
+
+    // This setting is required by ParquetOutputWriter starting from Spark 3.3.x.
+ conf.set(
+ SQLConf.PARQUET_FIELD_ID_WRITE_ENABLED.key,
+ sqlConf.parquetFieldIdWriteEnabled.toString
+ )
+
+ // SPARK-15719: Disables writing Parquet summary files by default.
+ if (
+ conf.get(ParquetOutputFormat.JOB_SUMMARY_LEVEL) == null
+ && conf.get(ParquetOutputFormat.ENABLE_JOB_SUMMARY) == null
+ ) {
+ conf.setEnum(ParquetOutputFormat.JOB_SUMMARY_LEVEL, JobSummaryLevel.NONE)
+ }
+
+ if (
+      ParquetOutputFormat.getJobSummaryLevel(conf) != JobSummaryLevel.NONE
+ && !classOf[ParquetOutputCommitter].isAssignableFrom(committerClass)
+ ) {
+ // output summary is requested, but the class is not a Parquet Committer
+ logWarning(
+ s"Committer $committerClass is not a ParquetOutputCommitter and cannot" +
+ s" create job summaries. " +
+ s"Set Parquet option ${ParquetOutputFormat.JOB_SUMMARY_LEVEL} to NONE."
+ )
+ }
+
+ new OutputWriterFactory {
+ override def newInstance(
+ path: String,
+ dataSchema: StructType,
+ context: TaskAttemptContext
+ ): OutputWriter = {
+ new ParquetOutputWriter(path, context)
+ }
+
+ override def getFileExtension(context: TaskAttemptContext): String = {
+ CodecConfig.from(context).getCodec.getExtension + ".parquet"
+ }
+ }
+ }
+}
diff --git a/maven-projects/spark/datasources-35/.scalafmt.conf b/maven-projects/spark/datasources-35/.scalafmt.conf
new file mode 120000
index 000000000..4cb05e831
--- /dev/null
+++ b/maven-projects/spark/datasources-35/.scalafmt.conf
@@ -0,0 +1 @@
+../.scalafmt.conf
\ No newline at end of file
diff --git a/maven-projects/spark/datasources-35/pom.xml b/maven-projects/spark/datasources-35/pom.xml
new file mode 100644
index 000000000..db65159af
--- /dev/null
+++ b/maven-projects/spark/datasources-35/pom.xml
@@ -0,0 +1,198 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.graphar</groupId>
+        <artifactId>spark</artifactId>
+        <version>${graphar.version}</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+
+    <artifactId>graphar-datasources</artifactId>
+    <version>${graphar.version}</version>
+    <packaging>jar</packaging>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-core_${scala.binary.version}</artifactId>
+            <version>${spark.version}</version>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-streaming_${scala.binary.version}</artifactId>
+            <version>${spark.version}</version>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-mllib_${scala.binary.version}</artifactId>
+            <version>${spark.version}</version>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-sql_${scala.binary.version}</artifactId>
+            <version>${spark.version}</version>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-hive_${scala.binary.version}</artifactId>
+            <version>${spark.version}</version>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+            <version>${jackson.version}</version>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.scala-tools</groupId>
+                <artifactId>maven-scala-plugin</artifactId>
+                <version>2.15.2</version>
+                <configuration>
+                    <scalaVersion>${scala.version}</scalaVersion>
+                    <args>
+                        <arg>-target:jvm-${maven.compiler.target}</arg>
+                    </args>
+                    <jvmArgs>
+                        <jvmArg>-Xss4096K</jvmArg>
+                    </jvmArgs>
+                </configuration>
+                <executions>
+                    <execution>
+                        <id>scala-compile</id>
+                        <goals>
+                            <goal>compile</goal>
+                        </goals>
+                        <configuration>
+                            <excludes>
+                                <exclude>META-INF/*.SF</exclude>
+                                <exclude>META-INF/*.DSA</exclude>
+                                <exclude>META-INF/*.RSA</exclude>
+                            </excludes>
+                        </configuration>
+                    </execution>
+                    <execution>
+                        <id>scala-test-compile</id>
+                        <goals>
+                            <goal>testCompile</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <groupId>net.alchim31.maven</groupId>
+                <artifactId>scala-maven-plugin</artifactId>
+                <version>4.8.0</version>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>compile</goal>
+                            <goal>testCompile</goal>
+                        </goals>
+                    </execution>
+                </executions>
+                <configuration>
+                    <jvmArgs>
+                        <jvmArg>-Xms64m</jvmArg>
+                        <jvmArg>-Xmx1024m</jvmArg>
+                    </jvmArgs>
+                    <args>
+                        <arg>-Ywarn-unused</arg>
+                    </args>
+                    <compilerPlugins>
+                        <compilerPlugin>
+                            <groupId>org.scalameta</groupId>
+                            <artifactId>semanticdb-scalac_${scala.version}</artifactId>
+                            <version>${semanticdb-scalac.version}</version>
+                        </compilerPlugin>
+                    </compilerPlugins>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>com.diffplug.spotless</groupId>
+                <artifactId>spotless-maven-plugin</artifactId>
+                <version>2.20.0</version>
+                <configuration>
+                    <java>
+                        <googleJavaFormat>
+                            <version>1.13.0</version>
+                        </googleJavaFormat>
+                    </java>
+                    <scala>
+                        <scalafmt>
+                            <file>${project.basedir}/.scalafmt.conf</file>
+                        </scalafmt>
+                    </scala>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>io.github.evis</groupId>
+                <artifactId>scalafix-maven-plugin_2.13</artifactId>
+                <version>0.1.8_0.11.0</version>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-source-plugin</artifactId>
+                <executions>
+                    <execution>
+                        <id>attach-sources</id>
+                        <goals>
+                            <goal>jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-javadoc-plugin</artifactId>
+                <executions>
+                    <execution>
+                        <id>attach-javadocs</id>
+                        <goals>
+                            <goal>jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-site-plugin</artifactId>
+                <version>3.7.1</version>
+            </plugin>
+        </plugins>
+    </build>
+</project>
diff --git a/maven-projects/spark/datasources-35/src/main/java/org/apache/graphar/GeneralParams.java b/maven-projects/spark/datasources-35/src/main/java/org/apache/graphar/GeneralParams.java
new file mode 120000
index 000000000..a3915d619
--- /dev/null
+++ b/maven-projects/spark/datasources-35/src/main/java/org/apache/graphar/GeneralParams.java
@@ -0,0 +1 @@
+../../../../../../../graphar/src/main/java/org/apache/graphar/GeneralParams.java
\ No newline at end of file
diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/graphar/datasources/GarDataSource.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/graphar/datasources/GarDataSource.scala
new file mode 100644
index 000000000..e502f82c6
--- /dev/null
+++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/graphar/datasources/GarDataSource.scala
@@ -0,0 +1,180 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.graphar.datasources
+
+import scala.collection.JavaConverters._
+import scala.util.matching.Regex
+import java.util
+import com.fasterxml.jackson.databind.ObjectMapper
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.apache.spark.sql.connector.catalog.{Table, TableProvider}
+import org.apache.spark.sql.execution.datasources._
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat
+import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
+import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
+import org.apache.spark.sql.execution.datasources.json.JsonFileFormat
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.util.CaseInsensitiveStringMap
+import org.apache.spark.sql.sources.DataSourceRegister
+import org.apache.spark.sql.connector.expressions.Transform
+import org.apache.spark.sql.graphar.GarTable
+
+// Derived from Apache Spark 3.1.1
+// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala
+
+/**
+ * GarDataSource provides GraphAr (gar) files as a data source for Spark.
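+ *
+ * Illustrative usage (the path and file format below are examples; the short name "gar"
+ * can be used instead of the class name when the source is registered):
+ * {{{
+ *   val df = spark.read
+ *     .format("org.apache.graphar.datasources.GarDataSource")
+ *     .option("fileFormat", "parquet")
+ *     .load("/path/to/graphar/chunks")
+ * }}}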
+ */
+class GarDataSource extends TableProvider with DataSourceRegister {
+ private val REDACTION_REPLACEMENT_TEXT = "*********(redacted)"
+
+ /**
+ * Redact the sensitive information in the given string.
+ */
+ // Copy of redact from graphar Utils
+ private def redact(regex: Option[Regex], text: String): String = {
+ regex match {
+ case None => text
+ case Some(r) =>
+ if (text == null || text.isEmpty) {
+ text
+ } else {
+ r.replaceAllIn(text, REDACTION_REPLACEMENT_TEXT)
+ }
+ }
+ }
+
+ /** The default fallback file format is Parquet. */
+ def fallbackFileFormat: Class[_ <: FileFormat] = classOf[ParquetFileFormat]
+
+ lazy val sparkSession = SparkSession.active
+
+ /** The string that represents the format name. */
+ override def shortName(): String = "gar"
+
+ protected def getPaths(map: CaseInsensitiveStringMap): Seq[String] = {
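+    // The "paths" option, when present, holds a JSON-encoded array of path strings,
+    // e.g. ["/data/chunk0", "/data/chunk1"]; a single "path" option is appended as well.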
+ val objectMapper = new ObjectMapper()
+ val paths = Option(map.get("paths"))
+ .map { pathStr =>
+ objectMapper.readValue(pathStr, classOf[Array[String]]).toSeq
+ }
+ .getOrElse(Seq.empty)
+ paths ++ Option(map.get("path")).toSeq
+ }
+
+ protected def getOptionsWithoutPaths(
+ map: CaseInsensitiveStringMap
+ ): CaseInsensitiveStringMap = {
+ val withoutPath = map.asCaseSensitiveMap().asScala.filterKeys { k =>
+ !k.equalsIgnoreCase("path") && !k.equalsIgnoreCase("paths")
+ }
+ new CaseInsensitiveStringMap(withoutPath.toMap.asJava)
+ }
+
+ protected def getTableName(
+ map: CaseInsensitiveStringMap,
+ paths: Seq[String]
+ ): String = {
+ val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(
+ map.asCaseSensitiveMap().asScala.toMap
+ )
+ val name = shortName() + " " + paths
+ .map(qualifiedPathName(_, hadoopConf))
+ .mkString(",")
+ redact(sparkSession.sessionState.conf.stringRedactionPattern, name)
+ }
+
+ private def qualifiedPathName(
+ path: String,
+ hadoopConf: Configuration
+ ): String = {
+ val hdfsPath = new Path(path)
+ val fs = hdfsPath.getFileSystem(hadoopConf)
+ hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory).toString
+ }
+
+ /** Provide a table from the data source. */
+ def getTable(options: CaseInsensitiveStringMap): Table = {
+ val paths = getPaths(options)
+ val tableName = getTableName(options, paths)
+ val optionsWithoutPaths = getOptionsWithoutPaths(options)
+ GarTable(
+ tableName,
+ sparkSession,
+ optionsWithoutPaths,
+ paths,
+ None,
+ getFallbackFileFormat(options)
+ )
+ }
+
+ /** Provide a table from the data source with specific schema. */
+ def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = {
+ val paths = getPaths(options)
+ val tableName = getTableName(options, paths)
+ val optionsWithoutPaths = getOptionsWithoutPaths(options)
+ GarTable(
+ tableName,
+ sparkSession,
+ optionsWithoutPaths,
+ paths,
+ Some(schema),
+ getFallbackFileFormat(options)
+ )
+ }
+
+ override def supportsExternalMetadata(): Boolean = true
+
+ private var t: Table = null
+
+ override def inferSchema(options: CaseInsensitiveStringMap): StructType = {
+ if (t == null) t = getTable(options)
+ t.schema()
+ }
+
+ override def inferPartitioning(
+ options: CaseInsensitiveStringMap
+ ): Array[Transform] = {
+ Array.empty
+ }
+
+ override def getTable(
+ schema: StructType,
+ partitioning: Array[Transform],
+ properties: util.Map[String, String]
+ ): Table = {
+ // If the table is already loaded during schema inference, return it directly.
+ if (t != null) {
+ t
+ } else {
+ getTable(new CaseInsensitiveStringMap(properties), schema)
+ }
+ }
+
+  // Get the actual fallback file format.
+ private def getFallbackFileFormat(
+ options: CaseInsensitiveStringMap
+ ): Class[_ <: FileFormat] = options.get("fileFormat") match {
+ case "csv" => classOf[CSVFileFormat]
+ case "orc" => classOf[OrcFileFormat]
+ case "parquet" => classOf[ParquetFileFormat]
+ case "json" => classOf[JsonFileFormat]
+ case _ => throw new IllegalArgumentException
+ }
+}
diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarCommitProtocol.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarCommitProtocol.scala
new file mode 100644
index 000000000..c6ca79c21
--- /dev/null
+++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarCommitProtocol.scala
@@ -0,0 +1,97 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.3.4
+// https://github.com/apache/spark/blob/18db204/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+
+package org.apache.spark.sql.graphar
+
+import org.apache.graphar.GeneralParams
+
+import org.json4s._
+import org.json4s.jackson.JsonMethods._
+
+import org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol
+import org.apache.hadoop.mapreduce._
+import org.apache.spark.internal.Logging
+import org.apache.spark.internal.io.FileNameSpec
+
+object GarCommitProtocol {
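+  // Map a global edge chunk index (key) to (vertex chunk index, edge chunk index within that
+  // vertex chunk), where aggNums(i) is the global index of the first edge chunk of vertex chunk i.
+  // For example, binarySearchPair(Array(0, 3, 5), 4) returns (1, 1).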
+ private def binarySearchPair(aggNums: Array[Int], key: Int): (Int, Int) = {
+ var low = 0
+ var high = aggNums.length - 1
+ var mid = 0
+ while (low <= high) {
+ mid = (high + low) / 2;
+ if (
+ aggNums(mid) <= key && (mid == aggNums.length - 1 || aggNums(
+ mid + 1
+ ) > key)
+ ) {
+ return (mid, key - aggNums(mid))
+ } else if (aggNums(mid) > key) {
+ high = mid - 1
+ } else {
+ low = mid + 1
+ }
+ }
+ return (low, key - aggNums(low))
+ }
+}
+
+class GarCommitProtocol(
+ jobId: String,
+ path: String,
+ options: Map[String, String],
+ dynamicPartitionOverwrite: Boolean = false
+) extends SQLHadoopMapReduceCommitProtocol(
+ jobId,
+ path,
+ dynamicPartitionOverwrite
+ )
+ with Serializable
+ with Logging {
+
+ // override getFilename to customize the file name
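+  // so that output files follow the GraphAr chunk layout: vertex chunks and offset chunks are
+  // named chunk<i>, while edge chunks are named part<vertex chunk index>/chunk<edge chunk index>.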
+ override def getFilename(
+ taskContext: TaskAttemptContext,
+ spec: FileNameSpec
+ ): String = {
+ val partitionId = taskContext.getTaskAttemptID.getTaskID.getId
+ if (options.contains(GeneralParams.offsetStartChunkIndexKey)) {
+ // offset chunk file name, looks like chunk0
+ val chunk_index =
+ options(GeneralParams.offsetStartChunkIndexKey).toInt + partitionId
+ return f"chunk$chunk_index"
+ }
+ if (options.contains(GeneralParams.aggNumListOfEdgeChunkKey)) {
+ // edge chunk file name, looks like part0/chunk0
+ val jValue = parse(
+ options(GeneralParams.aggNumListOfEdgeChunkKey)
+ )
+ implicit val formats =
+        DefaultFormats // use the default json4s formats for extraction
+ val aggNums: Array[Int] = Extraction.extract[Array[Int]](jValue)
+ val chunkPair: (Int, Int) =
+ GarCommitProtocol.binarySearchPair(aggNums, partitionId)
+ val vertex_chunk_index: Int = chunkPair._1
+ val edge_chunk_index: Int = chunkPair._2
+ return f"part$vertex_chunk_index/chunk$edge_chunk_index"
+ }
+ // vertex chunk file name, looks like chunk0
+ return f"chunk$partitionId"
+ }
+}
diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala
new file mode 100644
index 000000000..11fd6a1dc
--- /dev/null
+++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala
@@ -0,0 +1,361 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.3.4
+// https://github.com/apache/spark/blob/18db204/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala
+
+package org.apache.spark.sql.graphar
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.apache.parquet.hadoop.ParquetInputFormat
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.csv.CSVOptions
+import org.apache.spark.sql.catalyst.json.JSONOptionsInRead
+import org.apache.spark.sql.catalyst.expressions.{ExprUtils, Expression}
+import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
+import org.apache.spark.sql.connector.read.PartitionReaderFactory
+import org.apache.spark.sql.execution.PartitionedFileUtil
+import org.apache.spark.sql.execution.datasources.{
+ FilePartition,
+ PartitionedFile,
+ PartitioningAwareFileIndex
+}
+import org.apache.spark.sql.execution.datasources.parquet.{
+ ParquetOptions,
+ ParquetReadSupport,
+ ParquetWriteSupport
+}
+import org.apache.spark.sql.execution.datasources.v2.FileScan
+import org.apache.spark.sql.execution.datasources.v2.csv.CSVPartitionReaderFactory
+import org.apache.spark.sql.execution.datasources.v2.json.JsonPartitionReaderFactory
+import org.apache.spark.sql.execution.datasources.v2.orc.OrcPartitionReaderFactory
+import org.apache.spark.sql.execution.datasources.orc.OrcOptions
+import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetPartitionReaderFactory
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.sources.Filter
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.util.CaseInsensitiveStringMap
+import org.apache.spark.util.SerializableConfiguration
+
+import scala.collection.mutable.ArrayBuffer
+import scala.jdk.CollectionConverters._
+import org.apache.spark.memory.MemoryMode
+
+/** GarScan is a class to implement the file scan for GarDataSource. */
+case class GarScan(
+ sparkSession: SparkSession,
+ hadoopConf: Configuration,
+ fileIndex: PartitioningAwareFileIndex,
+ dataSchema: StructType,
+ readDataSchema: StructType,
+ readPartitionSchema: StructType,
+ pushedFilters: Array[Filter],
+ options: CaseInsensitiveStringMap,
+ formatName: String,
+ partitionFilters: Seq[Expression] = Seq.empty,
+ dataFilters: Seq[Expression] = Seq.empty
+) extends FileScan {
+
+ /** The gar format is not splitable. */
+ override def isSplitable(path: Path): Boolean = false
+
+ /** Create the reader factory according to the actual file format. */
+ override def createReaderFactory(): PartitionReaderFactory =
+ formatName match {
+ case "csv" => createCSVReaderFactory()
+ case "orc" => createOrcReaderFactory()
+ case "parquet" => createParquetReaderFactory()
+ case "json" => createJSONReaderFactory()
+ case _ =>
+ throw new IllegalArgumentException("Invalid format name: " + formatName)
+ }
+
+ // Create the reader factory for the CSV format.
+ private def createCSVReaderFactory(): PartitionReaderFactory = {
+ val columnPruning = sparkSession.sessionState.conf.csvColumnPruning &&
+ !readDataSchema.exists(
+ _.name == sparkSession.sessionState.conf.columnNameOfCorruptRecord
+ )
+
+ val parsedOptions: CSVOptions = new CSVOptions(
+ options.asScala.toMap,
+ columnPruning = columnPruning,
+ sparkSession.sessionState.conf.sessionLocalTimeZone,
+ sparkSession.sessionState.conf.columnNameOfCorruptRecord
+ )
+
+    // Check the field requirement for corrupt records here to throw an exception on the driver side.
+ ExprUtils.verifyColumnNameOfCorruptRecord(
+ dataSchema,
+ parsedOptions.columnNameOfCorruptRecord
+ )
+    // Don't push any filter which refers to the "virtual" column which cannot be present in the input.
+ // Such filters will be applied later on the upper layer.
+ val actualFilters =
+ pushedFilters.filterNot(
+ _.references.contains(parsedOptions.columnNameOfCorruptRecord)
+ )
+
+ val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap
+ // Hadoop Configurations are case sensitive.
+ val hadoopConf =
+ sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap)
+ val broadcastedConf = sparkSession.sparkContext.broadcast(
+ new SerializableConfiguration(hadoopConf)
+ )
+ // The partition values are already truncated in `FileScan.partitions`.
+ // We should use `readPartitionSchema` as the partition schema here.
+ CSVPartitionReaderFactory(
+ sparkSession.sessionState.conf,
+ broadcastedConf,
+ dataSchema,
+ readDataSchema,
+ readPartitionSchema,
+ parsedOptions,
+ actualFilters
+ )
+ }
+
+ // Create the reader factory for the Orc format.
+ private def createOrcReaderFactory(): PartitionReaderFactory = {
+ val broadcastedConf = sparkSession.sparkContext.broadcast(
+ new SerializableConfiguration(hadoopConf)
+ )
+ // The partition values are already truncated in `FileScan.partitions`.
+ // We should use `readPartitionSchema` as the partition schema here.
+ OrcPartitionReaderFactory(
+ sqlConf = sparkSession.sessionState.conf,
+ broadcastedConf = broadcastedConf,
+ dataSchema = dataSchema,
+ readDataSchema = readDataSchema,
+ partitionSchema = readPartitionSchema,
+ filters = pushedFilters,
+ aggregation = None,
+ options = new OrcOptions(
+ Map.empty[String, String],
+ sparkSession.sessionState.conf
+ ),
+ memoryMode = MemoryMode.ON_HEAP
+ )
+ }
+
+ // Create the reader factory for the Parquet format.
+ private def createParquetReaderFactory(): PartitionReaderFactory = {
+ val readDataSchemaAsJson = readDataSchema.json
+ hadoopConf.set(
+ ParquetInputFormat.READ_SUPPORT_CLASS,
+ classOf[ParquetReadSupport].getName
+ )
+ hadoopConf.set(
+ ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA,
+ readDataSchemaAsJson
+ )
+ hadoopConf.set(ParquetWriteSupport.SPARK_ROW_SCHEMA, readDataSchemaAsJson)
+ hadoopConf.set(
+ SQLConf.SESSION_LOCAL_TIMEZONE.key,
+ sparkSession.sessionState.conf.sessionLocalTimeZone
+ )
+ hadoopConf.setBoolean(
+ SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key,
+ sparkSession.sessionState.conf.nestedSchemaPruningEnabled
+ )
+ hadoopConf.setBoolean(
+ SQLConf.CASE_SENSITIVE.key,
+ sparkSession.sessionState.conf.caseSensitiveAnalysis
+ )
+
+ ParquetWriteSupport.setSchema(readDataSchema, hadoopConf)
+
+ // Sets flags for `ParquetToSparkSchemaConverter`
+ hadoopConf.setBoolean(
+ SQLConf.PARQUET_BINARY_AS_STRING.key,
+ sparkSession.sessionState.conf.isParquetBinaryAsString
+ )
+ hadoopConf.setBoolean(
+ SQLConf.PARQUET_INT96_AS_TIMESTAMP.key,
+ sparkSession.sessionState.conf.isParquetINT96AsTimestamp
+ )
+ hadoopConf.setBoolean(
+ SQLConf.LEGACY_PARQUET_NANOS_AS_LONG.key,
+ sparkSession.sessionState.conf.legacyParquetNanosAsLong
+ )
+ hadoopConf.setBoolean(
+ SQLConf.PARQUET_FIELD_ID_READ_ENABLED.key,
+ sparkSession.sessionState.conf.parquetFieldIdReadEnabled
+ )
+ hadoopConf.setBoolean(
+ SQLConf.PARQUET_INFER_TIMESTAMP_NTZ_ENABLED.key,
+ sparkSession.sessionState.conf.parquetInferTimestampNTZEnabled
+ )
+
+ val broadcastedConf = sparkSession.sparkContext.broadcast(
+ new SerializableConfiguration(hadoopConf)
+ )
+ val sqlConf = sparkSession.sessionState.conf
+ ParquetPartitionReaderFactory(
+ sqlConf = sqlConf,
+ broadcastedConf = broadcastedConf,
+ dataSchema = dataSchema,
+ readDataSchema = readDataSchema,
+ partitionSchema = readPartitionSchema,
+ filters = pushedFilters,
+ aggregation = None,
+ new ParquetOptions(options.asCaseSensitiveMap.asScala.toMap, sqlConf)
+ )
+ }
+
+ // Create the reader factory for the JSON format.
+ private def createJSONReaderFactory(): PartitionReaderFactory = {
+ val parsedOptions = new JSONOptionsInRead(
+ CaseInsensitiveMap(options.asScala.toMap),
+ sparkSession.sessionState.conf.sessionLocalTimeZone,
+ sparkSession.sessionState.conf.columnNameOfCorruptRecord
+ )
+
+    // Check the field requirement for corrupt records here to throw an exception on the driver side.
+ ExprUtils.verifyColumnNameOfCorruptRecord(
+ dataSchema,
+ parsedOptions.columnNameOfCorruptRecord
+ )
+    // Don't push any filter which refers to the "virtual" column which cannot be present in the input.
+ // Such filters will be applied later on the upper layer.
+ val actualFilters =
+ pushedFilters.filterNot(
+ _.references.contains(parsedOptions.columnNameOfCorruptRecord)
+ )
+
+ val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap
+ // Hadoop Configurations are case sensitive.
+ val hadoopConf =
+ sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap)
+ val broadcastedConf = sparkSession.sparkContext.broadcast(
+ new SerializableConfiguration(hadoopConf)
+ )
+ // The partition values are already truncated in `FileScan.partitions`.
+ // We should use `readPartitionSchema` as the partition schema here.
+ JsonPartitionReaderFactory(
+ sparkSession.sessionState.conf,
+ broadcastedConf,
+ dataSchema,
+ readDataSchema,
+ readPartitionSchema,
+ parsedOptions,
+ actualFilters
+ )
+ }
+
+ /**
+ * Override "partitions" of
+ * org.apache.spark.sql.execution.datasources.v2.FileScan to disable splitting
+ * and sort the files by file paths instead of by file sizes. Note: This
+ * implementation does not support to partition attributes.
+ */
+ override protected def partitions: Seq[FilePartition] = {
+ val selectedPartitions = fileIndex.listFiles(partitionFilters, dataFilters)
+ val maxSplitBytes =
+ FilePartition.maxSplitBytes(sparkSession, selectedPartitions)
+
+ val splitFiles = selectedPartitions.flatMap { partition =>
+ val partitionValues = partition.values
+ partition.files
+ .flatMap { file =>
+ val filePath = file.getPath
+ PartitionedFileUtil.splitFiles(
+ sparkSession = sparkSession,
+ file = file,
+ isSplitable = isSplitable(filePath),
+ maxSplitBytes = maxSplitBytes,
+ partitionValues = partitionValues
+ )
+ }
+ .toArray
+ .sortBy(_.filePath.toPath)
+      // Starting from Spark 3.4, PartitionedFile.filePath is a SparkPath, not a String.
+ }
+
+ getFilePartitions(sparkSession, splitFiles)
+ }
+
+ /**
+ * Override "getFilePartitions" of
+ * org.apache.spark.sql.execution.datasources.FilePartition to assign each
+ * chunk file in GraphAr to a single partition.
+ */
+ private def getFilePartitions(
+ sparkSession: SparkSession,
+ partitionedFiles: Seq[PartitionedFile]
+ ): Seq[FilePartition] = {
+ val partitions = new ArrayBuffer[FilePartition]
+ val currentFiles = new ArrayBuffer[PartitionedFile]
+
+ /** Close the current partition and move to the next. */
+ def closePartition(): Unit = {
+ if (currentFiles.nonEmpty) {
+ // Copy to a new Array.
+ val newPartition = FilePartition(partitions.size, currentFiles.toArray)
+ partitions += newPartition
+ }
+ currentFiles.clear()
+ }
+ // Assign a file to each partition
+ partitionedFiles.foreach { file =>
+ closePartition()
+ // Add the given file to the current partition.
+ currentFiles += file
+ }
+ closePartition()
+ partitions.toSeq
+ }
+
+ /** Check if two objects are equal. */
+ override def equals(obj: Any): Boolean = obj match {
+ case g: GarScan =>
+ super.equals(g) && dataSchema == g.dataSchema && options == g.options &&
+ equivalentFilters(
+ pushedFilters,
+ g.pushedFilters
+ ) && formatName == g.formatName
+ case _ => false
+ }
+
+ /** Get the hash code of the object. */
+ override def hashCode(): Int = formatName match {
+ case "csv" => super.hashCode()
+ case "json" => super.hashCode()
+ case "orc" => getClass.hashCode()
+ case "parquet" => getClass.hashCode()
+ case _ =>
+ throw new IllegalArgumentException("Invalid format name: " + formatName)
+ }
+
+ /** Get the description string of the object. */
+ override def description(): String = {
+ super.description() + ", PushedFilters: " + seqToString(pushedFilters)
+ }
+
+ /** Get the meta data map of the object. */
+ override def getMetaData(): Map[String, String] = {
+ super.getMetaData() ++ Map("PushedFilters" -> seqToString(pushedFilters))
+ }
+
+ /** Construct the file scan with filters. */
+ def withFilters(
+ partitionFilters: Seq[Expression],
+ dataFilters: Seq[Expression]
+ ): FileScan =
+ this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters)
+}
diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala
new file mode 100644
index 000000000..706b72ae3
--- /dev/null
+++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala
@@ -0,0 +1,109 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.3.4
+// https://github.com/apache/spark/blob/18db204/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScanBuilder.scala
+
+package org.apache.spark.sql.graphar
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.connector.read.Scan
+import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
+import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder
+import org.apache.spark.sql.execution.datasources.v2.orc.OrcScanBuilder
+import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScanBuilder
+import org.apache.spark.sql.sources.Filter
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.util.CaseInsensitiveStringMap
+
+import scala.collection.JavaConverters._
+
+/** GarScanBuilder is a class to build the file scan for GarDataSource. */
+case class GarScanBuilder(
+ sparkSession: SparkSession,
+ fileIndex: PartitioningAwareFileIndex,
+ schema: StructType,
+ dataSchema: StructType,
+ options: CaseInsensitiveStringMap,
+ formatName: String
+) extends FileScanBuilder(sparkSession, fileIndex, dataSchema) {
+ lazy val hadoopConf = {
+ val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap
+ // Hadoop Configurations are case sensitive.
+ sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap)
+ }
+
+ private var filters: Array[Filter] = Array.empty
+
+ override def pushDataFilters(dataFilters: Array[Filter]): Array[Filter] = {
+ this.filters = dataFilters
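+    // CSV and JSON do not support data filter pushdown in this data source,
+    // so no filters are reported as pushed for those formats.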
+ formatName match {
+ case "csv" => Array.empty[Filter]
+ case "json" => Array.empty[Filter]
+ case "orc" => pushedOrcFilters
+ case "parquet" => pushedParquetFilters
+ case _ =>
+ throw new IllegalArgumentException("Invalid format name: " + formatName)
+ }
+ }
+
+ private lazy val pushedParquetFilters: Array[Filter] = {
+ if (!sparkSession.sessionState.conf.parquetFilterPushDown) {
+ Array.empty[Filter]
+ } else {
+ val builder =
+ ParquetScanBuilder(sparkSession, fileIndex, schema, dataSchema, options)
+ builder.pushDataFilters(this.filters)
+ }
+ }
+
+ private lazy val pushedOrcFilters: Array[Filter] = {
+ if (!sparkSession.sessionState.conf.orcFilterPushDown) {
+ Array.empty[Filter]
+ } else {
+ val builder =
+ OrcScanBuilder(sparkSession, fileIndex, schema, dataSchema, options)
+ builder.pushDataFilters(this.filters)
+ }
+ }
+
+ // Check if the file format supports nested schema pruning.
+ override protected val supportsNestedSchemaPruning: Boolean =
+ formatName match {
+ case "csv" => false
+ case "json" => false
+ case "orc" => sparkSession.sessionState.conf.nestedSchemaPruningEnabled
+ case "parquet" =>
+ sparkSession.sessionState.conf.nestedSchemaPruningEnabled
+ case _ =>
+ throw new IllegalArgumentException("Invalid format name: " + formatName)
+ }
+
+ /** Build the file scan for GarDataSource. */
+ override def build(): Scan = {
+ GarScan(
+ sparkSession,
+ hadoopConf,
+ fileIndex,
+ dataSchema,
+ readDataSchema(),
+ readPartitionSchema(),
+ pushedDataFilters,
+ options,
+ formatName
+ )
+ }
+}
diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala
new file mode 100644
index 000000000..df874ea32
--- /dev/null
+++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala
@@ -0,0 +1,150 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.1.1
+// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala
+
+package org.apache.spark.sql.graphar
+
+import org.apache.hadoop.fs.FileStatus
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.csv.CSVOptions
+import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
+import org.apache.spark.sql.execution.datasources.FileFormat
+import org.apache.spark.sql.execution.datasources.csv.CSVDataSource
+import org.apache.spark.sql.execution.datasources.orc.OrcUtils
+import org.apache.spark.sql.execution.datasources.parquet.ParquetUtils
+import org.apache.spark.sql.execution.datasources.v2.FileTable
+import org.apache.spark.sql.graphar.csv.CSVWriteBuilder
+import org.apache.spark.sql.graphar.orc.OrcWriteBuilder
+import org.apache.spark.sql.graphar.parquet.ParquetWriteBuilder
+import org.apache.spark.sql.graphar.json.JSONWriteBuilder
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.util.CaseInsensitiveStringMap
+import org.apache.spark.sql.execution.datasources.json.JsonDataSource
+import org.apache.spark.sql.catalyst.json.JSONOptions
+
+import scala.collection.JavaConverters._
+
+/** GarTable is a class to represent the graph data in GraphAr as a table. */
+case class GarTable(
+ name: String,
+ sparkSession: SparkSession,
+ options: CaseInsensitiveStringMap,
+ paths: Seq[String],
+ userSpecifiedSchema: Option[StructType],
+ fallbackFileFormat: Class[_ <: FileFormat]
+) extends FileTable(sparkSession, options, paths, userSpecifiedSchema) {
+
+ /** Construct a new scan builder. */
+ override def newScanBuilder(
+ options: CaseInsensitiveStringMap
+ ): GarScanBuilder =
+ new GarScanBuilder(
+ sparkSession,
+ fileIndex,
+ schema,
+ dataSchema,
+ options,
+ formatName
+ )
+
+ /**
+ * Infer the schema of the table through the methods of the actual file
+ * format.
+ */
+ override def inferSchema(files: Seq[FileStatus]): Option[StructType] =
+ formatName match {
+ case "csv" => {
+ val parsedOptions = new CSVOptions(
+ options.asScala.toMap,
+ columnPruning = sparkSession.sessionState.conf.csvColumnPruning,
+ sparkSession.sessionState.conf.sessionLocalTimeZone
+ )
+
+ CSVDataSource(parsedOptions).inferSchema(
+ sparkSession,
+ files,
+ parsedOptions
+ )
+ }
+ case "orc" =>
+ OrcUtils.inferSchema(sparkSession, files, options.asScala.toMap)
+ case "parquet" =>
+ ParquetUtils.inferSchema(sparkSession, options.asScala.toMap, files)
+ case "json" => {
+ val parsedOptions = new JSONOptions(
+ options.asScala.toMap,
+ sparkSession.sessionState.conf.sessionLocalTimeZone
+ )
+
+ JsonDataSource(parsedOptions).inferSchema(
+ sparkSession,
+ files,
+ parsedOptions
+ )
+ }
+ case _ =>
+ throw new IllegalArgumentException("Invalid format name: " + formatName)
+
+ }
+
+ /** Construct a new write builder according to the actual file format. */
+ override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder =
+ formatName match {
+ case "csv" =>
+ new CSVWriteBuilder(paths, formatName, supportsDataType, info)
+ case "orc" =>
+ new OrcWriteBuilder(paths, formatName, supportsDataType, info)
+ case "parquet" =>
+ new ParquetWriteBuilder(paths, formatName, supportsDataType, info)
+ case "json" =>
+ new JSONWriteBuilder(paths, formatName, supportsDataType, info)
+ case _ =>
+ throw new IllegalArgumentException("Invalid format name: " + formatName)
+ }
+
+ /**
+ * Check if a data type is supported. Note: Currently, the GraphAr data source
+ * only supports several atomic data types. To support additional data types
+ * such as Struct, Array and Map, revise this function to handle them case by
+ * case as the commented code shows.
+ */
+ override def supportsDataType(dataType: DataType): Boolean = dataType match {
+ // case _: AnsiIntervalType => false
+
+ case _: AtomicType => true
+
+ // case st: StructType => st.forall { f => supportsDataType(f.dataType) }
+
+ case ArrayType(elementType, _) =>
+ formatName match {
+ case "orc" => supportsDataType(elementType)
+ case "parquet" => supportsDataType(elementType)
+ case _ => false
+ }
+
+ // case MapType(keyType, valueType, _) =>
+ // supportsDataType(keyType) && supportsDataType(valueType)
+
+ // case udt: UserDefinedType[_] => supportsDataType(udt.sqlType)
+
+ case _ => false
+ }
+
+ /** The actual file format for storing the data in GraphAr. */
+ override def formatName: String = options.get("fileFormat")
+}
diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarWriteBuilder.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarWriteBuilder.scala
new file mode 100644
index 000000000..58f1890da
--- /dev/null
+++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarWriteBuilder.scala
@@ -0,0 +1,176 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.3.4
+// https://github.com/apache/spark/blob/18db204/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriteBuilder.scala
+
+package org.apache.spark.sql.graphar
+
+import java.util.UUID
+
+import scala.collection.JavaConverters._
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.mapreduce.Job
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
+
+import org.apache.spark.sql.execution.datasources.OutputWriterFactory
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils}
+import org.apache.spark.sql.connector.write.{
+ BatchWrite,
+ LogicalWriteInfo,
+ WriteBuilder
+}
+import org.apache.spark.sql.execution.datasources.{
+ BasicWriteJobStatsTracker,
+ DataSource,
+ WriteJobDescription
+}
+import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.{DataType, StructType}
+import org.apache.spark.util.SerializableConfiguration
+import org.apache.spark.sql.execution.datasources.v2.FileBatchWrite
+import org.apache.spark.sql.catalyst.expressions.AttributeReference
+
+abstract class GarWriteBuilder(
+ paths: Seq[String],
+ formatName: String,
+ supportsDataType: DataType => Boolean,
+ info: LogicalWriteInfo
+) extends WriteBuilder {
+ private val schema = info.schema()
+ private val queryId = info.queryId()
+ private val options = info.options()
+
+ override def buildForBatch(): BatchWrite = {
+ val sparkSession = SparkSession.active
+ validateInputs(sparkSession.sessionState.conf.caseSensitiveAnalysis)
+ val path = new Path(paths.head)
+ val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap
+ // Hadoop Configurations are case sensitive.
+ val hadoopConf =
+ sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap)
+ val job = getJobInstance(hadoopConf, path)
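+    // GarCommitProtocol customizes output file names so that the written files follow the
+    // GraphAr chunk layout (see GarCommitProtocol.getFilename).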
+ val committer = new GarCommitProtocol(
+ java.util.UUID.randomUUID().toString,
+ paths.head,
+ options.asScala.toMap,
+ false
+ )
+ lazy val description =
+ createWriteJobDescription(
+ sparkSession,
+ hadoopConf,
+ job,
+ paths.head,
+ options.asScala.toMap
+ )
+
+ committer.setupJob(job)
+ new FileBatchWrite(job, description, committer)
+ }
+
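+  // Implemented by the per-format write builders (CSV/ORC/Parquet/JSON) to configure the Hadoop
+  // job and provide the OutputWriterFactory used for the batch write.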
+ def prepareWrite(
+ sqlConf: SQLConf,
+ job: Job,
+ options: Map[String, String],
+ dataSchema: StructType
+ ): OutputWriterFactory
+
+ private def validateInputs(caseSensitiveAnalysis: Boolean): Unit = {
+ assert(schema != null, "Missing input data schema")
+ assert(queryId != null, "Missing query ID")
+
+ if (paths.length != 1) {
+ throw new IllegalArgumentException(
+ "Expected exactly one path to be specified, but " +
+ s"got: ${paths.mkString(", ")}"
+ )
+ }
+ val pathName = paths.head
+ val sqlConf = SparkSession.active.sessionState.conf
+ DataSource.validateSchema(schema, sqlConf)
+
+ schema.foreach { field =>
+ if (!supportsDataType(field.dataType)) {
+ throw new IllegalArgumentException(
+ s"$formatName data source does not support ${field.dataType.catalogString} data type."
+ )
+ }
+ }
+ }
+
+ private def getJobInstance(hadoopConf: Configuration, path: Path): Job = {
+ val job = Job.getInstance(hadoopConf)
+ job.setOutputKeyClass(classOf[Void])
+ job.setOutputValueClass(classOf[InternalRow])
+ FileOutputFormat.setOutputPath(job, path)
+ job
+ }
+
+ private def createWriteJobDescription(
+ sparkSession: SparkSession,
+ hadoopConf: Configuration,
+ job: Job,
+ pathName: String,
+ options: Map[String, String]
+ ): WriteJobDescription = {
+ val caseInsensitiveOptions = CaseInsensitiveMap(options)
+ // Note: prepareWrite has side effect. It sets "job".
+ val outputWriterFactory =
+ prepareWrite(
+ sparkSession.sessionState.conf,
+ job,
+ caseInsensitiveOptions,
+ schema
+ )
+    // Same as schema.toAttributes, which is private to the Spark package.
+ val allColumns: Seq[AttributeReference] = schema.map(f =>
+ AttributeReference(f.name, f.dataType, f.nullable, f.metadata)()
+ )
+ val metrics: Map[String, SQLMetric] = BasicWriteJobStatsTracker.metrics
+ val serializableHadoopConf = new SerializableConfiguration(hadoopConf)
+ val statsTracker =
+ new BasicWriteJobStatsTracker(serializableHadoopConf, metrics)
+ // TODO: after partitioning is supported in V2:
+ // 1. filter out partition columns in `dataColumns`.
+ // 2. Don't use Seq.empty for `partitionColumns`.
+ new WriteJobDescription(
+ uuid = UUID.randomUUID().toString,
+ serializableHadoopConf =
+ new SerializableConfiguration(job.getConfiguration),
+ outputWriterFactory = outputWriterFactory,
+ allColumns = allColumns,
+ dataColumns = allColumns,
+ partitionColumns = Seq.empty,
+ bucketSpec = None,
+ path = pathName,
+ customPartitionLocations = Map.empty,
+ maxRecordsPerFile = caseInsensitiveOptions
+ .get("maxRecordsPerFile")
+ .map(_.toLong)
+ .getOrElse(sparkSession.sessionState.conf.maxRecordsPerFile),
+ timeZoneId = caseInsensitiveOptions
+ .get(DateTimeUtils.TIMEZONE_OPTION)
+ .getOrElse(sparkSession.sessionState.conf.sessionLocalTimeZone),
+ statsTrackers = Seq(statsTracker)
+ )
+ }
+}
diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/csv/CSVWriteBuilder.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/csv/CSVWriteBuilder.scala
new file mode 100644
index 000000000..68e156e07
--- /dev/null
+++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/csv/CSVWriteBuilder.scala
@@ -0,0 +1,72 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.1.1
+// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVWriteBuilder.scala
+
+package org.apache.spark.sql.graphar.csv
+
+import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
+import org.apache.spark.sql.catalyst.csv.CSVOptions
+import org.apache.spark.sql.catalyst.util.CompressionCodecs
+import org.apache.spark.sql.connector.write.LogicalWriteInfo
+import org.apache.spark.sql.execution.datasources.{
+ CodecStreams,
+ OutputWriter,
+ OutputWriterFactory
+}
+import org.apache.spark.sql.execution.datasources.csv.CsvOutputWriter
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.{DataType, StructType}
+import org.apache.spark.sql.graphar.GarWriteBuilder
+
+class CSVWriteBuilder(
+ paths: Seq[String],
+ formatName: String,
+ supportsDataType: DataType => Boolean,
+ info: LogicalWriteInfo
+) extends GarWriteBuilder(paths, formatName, supportsDataType, info) {
+ override def prepareWrite(
+ sqlConf: SQLConf,
+ job: Job,
+ options: Map[String, String],
+ dataSchema: StructType
+ ): OutputWriterFactory = {
+ val conf = job.getConfiguration
+ val csvOptions = new CSVOptions(
+ options,
+ columnPruning = sqlConf.csvColumnPruning,
+ sqlConf.sessionLocalTimeZone
+ )
+ csvOptions.compressionCodec.foreach { codec =>
+ CompressionCodecs.setCodecConfiguration(conf, codec)
+ }
+
+ new OutputWriterFactory {
+ override def newInstance(
+ path: String,
+ dataSchema: StructType,
+ context: TaskAttemptContext
+ ): OutputWriter = {
+ new CsvOutputWriter(path, dataSchema, context, csvOptions)
+ }
+
+ override def getFileExtension(context: TaskAttemptContext): String = {
+ ".csv" + CodecStreams.getCompressionExtension(context)
+ }
+ }
+ }
+}
diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/json/JSONWriteBuilder.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/json/JSONWriteBuilder.scala
new file mode 100644
index 000000000..150a9a9f8
--- /dev/null
+++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/json/JSONWriteBuilder.scala
@@ -0,0 +1,73 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.5.1
+// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonWriteBuilder.scala
+
+package org.apache.spark.sql.graphar.json
+
+import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
+import org.apache.spark.sql.catalyst.util.CompressionCodecs
+import org.apache.spark.sql.connector.write.LogicalWriteInfo
+import org.apache.spark.sql.execution.datasources.json.JsonOutputWriter
+import org.apache.spark.sql.execution.datasources.{
+ CodecStreams,
+ OutputWriter,
+ OutputWriterFactory
+}
+
+import org.apache.spark.sql.catalyst.json.JSONOptions
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.{StructType, DataType}
+
+import org.apache.spark.sql.graphar.GarWriteBuilder
+
+class JSONWriteBuilder(
+ paths: Seq[String],
+ formatName: String,
+ supportsDataType: DataType => Boolean,
+ info: LogicalWriteInfo
+) extends GarWriteBuilder(paths, formatName, supportsDataType, info) {
+ override def prepareWrite(
+ sqlConf: SQLConf,
+ job: Job,
+ options: Map[String, String],
+ dataSchema: StructType
+ ): OutputWriterFactory = {
+ val conf = job.getConfiguration
+ val parsedOptions = new JSONOptions(
+ options,
+ sqlConf.sessionLocalTimeZone,
+ sqlConf.columnNameOfCorruptRecord
+ )
+ parsedOptions.compressionCodec.foreach { codec =>
+ CompressionCodecs.setCodecConfiguration(conf, codec)
+ }
+
+ new OutputWriterFactory {
+ override def newInstance(
+ path: String,
+ dataSchema: StructType,
+ context: TaskAttemptContext
+ ): OutputWriter = {
+ new JsonOutputWriter(path, parsedOptions, dataSchema, context)
+ }
+
+ override def getFileExtension(context: TaskAttemptContext): String = {
+ ".json" + CodecStreams.getCompressionExtension(context)
+ }
+ }
+ }
+}
diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/orc/OrcOutputWriter.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/orc/OrcOutputWriter.scala
new file mode 100644
index 000000000..ccc7a48e1
--- /dev/null
+++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/orc/OrcOutputWriter.scala
@@ -0,0 +1,70 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.1.1. Since OrcOutputWriter is private in the original
+// source, we have to reimplement it here.
+// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcOutputWriter.scala
+
+package org.apache.spark.sql.graphar.orc
+
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.io.NullWritable
+import org.apache.hadoop.mapreduce.TaskAttemptContext
+import org.apache.orc.OrcFile
+import org.apache.orc.mapred.{
+ OrcOutputFormat => OrcMapRedOutputFormat,
+ OrcStruct
+}
+import org.apache.orc.mapreduce.{OrcMapreduceRecordWriter, OrcOutputFormat}
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.execution.datasources.OutputWriter
+import org.apache.spark.sql.execution.datasources.orc.{OrcSerializer, OrcUtils}
+import org.apache.spark.sql.types._
+
+class OrcOutputWriter(
+ val path: String,
+ dataSchema: StructType,
+ context: TaskAttemptContext
+) extends OutputWriter {
+
+ private[this] val serializer = new OrcSerializer(dataSchema)
+
+ private val recordWriter = {
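+    // getDefaultWorkFile is overridden so the record writer emits exactly the
+    // requested `path` (the GraphAr chunk file) rather than a task-generated work file.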
+ val orcOutputFormat = new OrcOutputFormat[OrcStruct]() {
+ override def getDefaultWorkFile(
+ context: TaskAttemptContext,
+ extension: String
+ ): Path = {
+ new Path(path)
+ }
+ }
+ val filename = orcOutputFormat.getDefaultWorkFile(context, ".orc")
+ val options = OrcMapRedOutputFormat.buildOptions(context.getConfiguration)
+ val writer = OrcFile.createWriter(filename, options)
+ val recordWriter = new OrcMapreduceRecordWriter[OrcStruct](writer)
+ OrcUtils.addSparkVersionMetadata(writer)
+ recordWriter
+ }
+
+ override def write(row: InternalRow): Unit = {
+ recordWriter.write(NullWritable.get(), serializer.serialize(row))
+ }
+
+ override def close(): Unit = {
+ recordWriter.close(context)
+ }
+}
diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/orc/OrcWriteBuilder.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/orc/OrcWriteBuilder.scala
new file mode 100644
index 000000000..287162f8e
--- /dev/null
+++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/orc/OrcWriteBuilder.scala
@@ -0,0 +1,104 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.1.1
+// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/ORCWriteBuilder.scala
+
+package org.apache.spark.sql.graphar.orc
+
+import org.apache.hadoop.mapred.JobConf
+import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
+import org.apache.orc.OrcConf.{COMPRESS, MAPRED_OUTPUT_SCHEMA}
+import org.apache.orc.mapred.OrcStruct
+
+import org.apache.spark.sql.connector.write.LogicalWriteInfo
+import org.apache.spark.sql.execution.datasources.{
+ OutputWriter,
+ OutputWriterFactory
+}
+import org.apache.spark.sql.execution.datasources.orc.{OrcOptions, OrcUtils}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types._
+
+import org.apache.spark.sql.graphar.GarWriteBuilder
+
+object OrcWriteBuilder {
+  // The getQuotedSchemaString method from Spark's OrcFileFormat.
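+  // e.g. a struct of (id: Long, name: String) maps to "struct<`id`:bigint,`name`:string>".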
+ private def getQuotedSchemaString(dataType: DataType): String =
+ dataType match {
+ case StructType(fields) =>
+ fields
+ .map(f => s"`${f.name}`:${getQuotedSchemaString(f.dataType)}")
+ .mkString("struct<", ",", ">")
+ case ArrayType(elementType, _) =>
+ s"array<${getQuotedSchemaString(elementType)}>"
+ case MapType(keyType, valueType, _) =>
+ s"map<${getQuotedSchemaString(keyType)},${getQuotedSchemaString(valueType)}>"
+ case _ => // UDT and others
+ dataType.catalogString
+ }
+}
+
+class OrcWriteBuilder(
+ paths: Seq[String],
+ formatName: String,
+ supportsDataType: DataType => Boolean,
+ info: LogicalWriteInfo
+) extends GarWriteBuilder(paths, formatName, supportsDataType, info) {
+
+ override def prepareWrite(
+ sqlConf: SQLConf,
+ job: Job,
+ options: Map[String, String],
+ dataSchema: StructType
+ ): OutputWriterFactory = {
+ val orcOptions = new OrcOptions(options, sqlConf)
+
+ val conf = job.getConfiguration
+
+ conf.set(
+ MAPRED_OUTPUT_SCHEMA.getAttribute,
+ OrcWriteBuilder.getQuotedSchemaString(dataSchema)
+ )
+
+ conf.set(COMPRESS.getAttribute, orcOptions.compressionCodec)
+
+ conf
+ .asInstanceOf[JobConf]
+ .setOutputFormat(
+ classOf[org.apache.orc.mapred.OrcOutputFormat[OrcStruct]]
+ )
+
+ new OutputWriterFactory {
+ override def newInstance(
+ path: String,
+ dataSchema: StructType,
+ context: TaskAttemptContext
+ ): OutputWriter = {
+ new OrcOutputWriter(path, dataSchema, context)
+ }
+
+ override def getFileExtension(context: TaskAttemptContext): String = {
+ val compressionExtension: String = {
+ val name = context.getConfiguration.get(COMPRESS.getAttribute)
+ OrcUtils.extensionsForCompressionCodecNames.getOrElse(name, "")
+ }
+
+ compressionExtension + ".orc"
+ }
+ }
+ }
+}
diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/parquet/ParquetWriteBuilder.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/parquet/ParquetWriteBuilder.scala
new file mode 100644
index 000000000..8e53dc5f8
--- /dev/null
+++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/parquet/ParquetWriteBuilder.scala
@@ -0,0 +1,152 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Derived from Apache Spark 3.1.1
+// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetWriteBuilder.scala
+
+package org.apache.spark.sql.graphar.parquet
+
+import org.apache.hadoop.mapreduce.{Job, OutputCommitter, TaskAttemptContext}
+import org.apache.parquet.hadoop.{ParquetOutputCommitter, ParquetOutputFormat}
+import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel
+import org.apache.parquet.hadoop.codec.CodecConfig
+import org.apache.parquet.hadoop.util.ContextUtil
+
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.connector.write.LogicalWriteInfo
+import org.apache.spark.sql.execution.datasources.{
+ OutputWriter,
+ OutputWriterFactory
+}
+import org.apache.spark.sql.execution.datasources.parquet._
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types._
+
+import org.apache.spark.sql.graphar.GarWriteBuilder
+
+class ParquetWriteBuilder(
+ paths: Seq[String],
+ formatName: String,
+ supportsDataType: DataType => Boolean,
+ info: LogicalWriteInfo
+) extends GarWriteBuilder(paths, formatName, supportsDataType, info)
+ with Logging {
+
+ override def prepareWrite(
+ sqlConf: SQLConf,
+ job: Job,
+ options: Map[String, String],
+ dataSchema: StructType
+ ): OutputWriterFactory = {
+ val parquetOptions = new ParquetOptions(options, sqlConf)
+
+ val conf = ContextUtil.getConfiguration(job)
+
+ val committerClass =
+ conf.getClass(
+ SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key,
+ classOf[ParquetOutputCommitter],
+ classOf[OutputCommitter]
+ )
+
+ if (conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) == null) {
+ logInfo(
+ "Using default output committer for Parquet: " +
+ classOf[ParquetOutputCommitter].getCanonicalName
+ )
+ } else {
+ logInfo(
+ "Using user defined output committer for Parquet: " + committerClass.getCanonicalName
+ )
+ }
+
+ conf.setClass(
+ SQLConf.OUTPUT_COMMITTER_CLASS.key,
+ committerClass,
+ classOf[OutputCommitter]
+ )
+
+ // We're not really using `ParquetOutputFormat[Row]` for writing data here, because we override
+ // it in `ParquetOutputWriter` to support appending and dynamic partitioning. The reason why
+ // we set it here is to setup the output committer class to `ParquetOutputCommitter`, which is
+ // bundled with `ParquetOutputFormat[Row]`.
+ job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]])
+
+ ParquetOutputFormat.setWriteSupportClass(job, classOf[ParquetWriteSupport])
+
+ // This metadata is useful for keeping UDTs like Vector/Matrix.
+ ParquetWriteSupport.setSchema(dataSchema, conf)
+
+ // Sets flags for `ParquetWriteSupport`, which converts Catalyst schema to Parquet
+ // schema and writes actual rows to Parquet files.
+ conf.set(
+ SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key,
+ sqlConf.writeLegacyParquetFormat.toString
+ )
+
+ conf.set(
+ SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key,
+ sqlConf.parquetOutputTimestampType.toString
+ )
+
+ // Sets compression scheme
+ conf.set(
+ ParquetOutputFormat.COMPRESSION,
+ parquetOptions.compressionCodecClassName
+ )
+
+    // Config required by ParquetOutputWriter starting from Spark 3.3.x.
+ conf.set(
+ SQLConf.PARQUET_FIELD_ID_WRITE_ENABLED.key,
+ sqlConf.parquetFieldIdWriteEnabled.toString
+ )
+
+ // SPARK-15719: Disables writing Parquet summary files by default.
+ if (
+ conf.get(ParquetOutputFormat.JOB_SUMMARY_LEVEL) == null
+ && conf.get(ParquetOutputFormat.ENABLE_JOB_SUMMARY) == null
+ ) {
+ conf.setEnum(ParquetOutputFormat.JOB_SUMMARY_LEVEL, JobSummaryLevel.NONE)
+ }
+
+ if (
+ ParquetOutputFormat.getJobSummaryLevel(conf) == JobSummaryLevel.NONE
+ && !classOf[ParquetOutputCommitter].isAssignableFrom(committerClass)
+ ) {
+ // output summary is requested, but the class is not a Parquet Committer
+ logWarning(
+ s"Committer $committerClass is not a ParquetOutputCommitter and cannot" +
+ s" create job summaries. " +
+ s"Set Parquet option ${ParquetOutputFormat.JOB_SUMMARY_LEVEL} to NONE."
+ )
+ }
+
+ new OutputWriterFactory {
+ override def newInstance(
+ path: String,
+ dataSchema: StructType,
+ context: TaskAttemptContext
+ ): OutputWriter = {
+ new ParquetOutputWriter(path, context)
+ }
+
+ override def getFileExtension(context: TaskAttemptContext): String = {
+ CodecConfig.from(context).getCodec.getExtension + ".parquet"
+ }
+ }
+ }
+}
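The four write builders added above share one shape: prepareWrite() configures the Hadoop job once per write and returns an OutputWriterFactory whose newInstance() produces one OutputWriter per output file, with getFileExtension() fixing the chunk-file suffix. As a rough, hypothetical sketch of how they end up being exercised (the "fileType" option name and the paths are assumptions for illustration, not the module's confirmed API):

import org.apache.spark.sql.SparkSession

// Hypothetical usage sketch, not part of this patch: writing a DataFrame through the
// GraphAr data source so that Spark resolves one of the GarWriteBuilder subclasses above.
// Assumes the graphar spark jar is on the classpath; option and path names are illustrative.
object GarWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("gar-write-sketch")
      .master("local[*]")
      .getOrCreate()
    val df = spark.range(10).toDF("id") // any small DataFrame will do
    df.write
      .format("org.apache.graphar.datasources.GarDataSource")
      .option("fileType", "parquet") // assumed option; "csv", "json", "orc" would pick the other builders
      .save("/tmp/graphar/vertex/person/id/") // assumed output prefix
    spark.stop()
  }
}

The matching read path, confirmed by the TestReader change later in this patch, uses the same data source name: spark.read.format("org.apache.graphar.datasources.GarDataSource").load(path).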
diff --git a/maven-projects/spark/graphar/pom.xml b/maven-projects/spark/graphar/pom.xml
index 9dc46eb49..045b27fc1 100644
--- a/maven-projects/spark/graphar/pom.xml
+++ b/maven-projects/spark/graphar/pom.xml
@@ -90,7 +90,7 @@
org.neo4j
- neo4j-connector-apache-spark_2.12
+ neo4j-connector-apache-spark_${scala.binary.version}
5.0.0_for_spark_3
@@ -100,7 +100,7 @@
org.scala-lang.modules
- scala-collection-compat_2.12
+ scala-collection-compat_${scala.binary.version}
2.1.1
@@ -113,7 +113,7 @@
${scala.version}
- -target:jvm-1.8
+ -target:jvm-${maven.compiler.target}
-Xss4096K
@@ -218,8 +218,8 @@
org.scalameta
- semanticdb-scalac_2.12.10
- 4.3.24
+ semanticdb-scalac_${scala.version}
+ ${semanticdb-scalac.version}
diff --git a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/graph/GraphReader.scala b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/graph/GraphReader.scala
index 3fbd56aa8..251e1b203 100644
--- a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/graph/GraphReader.scala
+++ b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/graph/GraphReader.scala
@@ -118,10 +118,13 @@ object GraphReader {
def readWithGraphInfo(
graphInfo: GraphInfo,
spark: SparkSession
- ): Pair[Map[String, DataFrame], Map[
- (String, String, String),
- Map[String, DataFrame]
- ]] = {
+ ): (
+ Map[String, DataFrame],
+ Map[
+ (String, String, String),
+ Map[String, DataFrame]
+ ]
+ ) = {
val prefix = graphInfo.getPrefix
val vertex_infos = graphInfo.getVertexInfos()
val edge_infos = graphInfo.getEdgeInfos()
@@ -148,10 +151,13 @@ object GraphReader {
def read(
graphInfoPath: String,
spark: SparkSession
- ): Pair[Map[String, DataFrame], Map[
- (String, String, String),
- Map[String, DataFrame]
- ]] = {
+ ): (
+ Map[String, DataFrame],
+ Map[
+ (String, String, String),
+ Map[String, DataFrame]
+ ]
+ ) = {
// load graph info
val graph_info = GraphInfo.loadGraphInfo(graphInfoPath, spark)
diff --git a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/graph/GraphWriter.scala b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/graph/GraphWriter.scala
index 18e7f649f..14098da56 100644
--- a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/graph/GraphWriter.scala
+++ b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/graph/GraphWriter.scala
@@ -281,11 +281,11 @@ class GraphWriter() {
}
val vertices: scala.collection.mutable.Map[String, DataFrame] =
- scala.collection.mutable.Map[String, DataFrame]()
+ scala.collection.mutable.Map.empty
val edges: scala.collection.mutable.Map[(String, String, String), DataFrame] =
- scala.collection.mutable.Map[(String, String, String), DataFrame]()
+ scala.collection.mutable.Map.empty
val vertexNums: scala.collection.mutable.Map[String, Long] =
- scala.collection.mutable.Map[String, Long]()
+ scala.collection.mutable.Map.empty
val primaryKeys: scala.collection.mutable.Map[String, String] =
- scala.collection.mutable.Map[String, String]()
+ scala.collection.mutable.Map.empty
}
diff --git a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/writer/EdgeWriter.scala b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/writer/EdgeWriter.scala
index b1e2e6748..d007d18c2 100644
--- a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/writer/EdgeWriter.scala
+++ b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/writer/EdgeWriter.scala
@@ -18,7 +18,6 @@
*/
package org.apache.graphar.writer
-
import org.apache.graphar.util.{FileSystem, EdgeChunkPartitioner}
import org.apache.graphar.{
GeneralParams,
@@ -160,9 +159,9 @@ object EdgeWriter {
val filterRDD = edgeCountsByPrimaryKey
.filter(v => v._1 / vertexChunkSize == i)
.map { case (k, v) => (k - i * vertexChunkSize + 1, v) }
- val initRDD = spark.sparkContext.parallelize(
- (0L to vertexChunkSize).map(key => (key, 0))
- )
+ val initRDD = spark.sparkContext
+ .range(0L, vertexChunkSize + 1)
+ .map(key => (key, 0))
val unionRDD = spark.sparkContext
.union(filterRDD, initRDD)
.reduceByKey(_ + _)
@@ -353,7 +352,8 @@ class EdgeWriter(
val property = pIter.next()
propertyList += "`" + property.getName() + "`"
}
- val propertyGroupDf = edgeDfAndOffsetDf._1.select(propertyList.map(col): _*)
+ val propertyGroupDf =
+ edgeDfAndOffsetDf._1.select(propertyList.map(col).toSeq: _*)
val outputPrefix =
prefix + edgeInfo.getPropertyGroupPathPrefix(propertyGroup, adjListType)
FileSystem.writeDataFrame(
diff --git a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/writer/VertexWriter.scala b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/writer/VertexWriter.scala
index d6e8483fd..dda261146 100644
--- a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/writer/VertexWriter.scala
+++ b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/writer/VertexWriter.scala
@@ -137,7 +137,7 @@ class VertexWriter(
val property = it.next()
property_list += "`" + property.getName() + "`"
}
- val pg_df = chunks.select(property_list.map(col): _*)
+ val pg_df = chunks.select(property_list.map(col).toSeq: _*)
FileSystem.writeDataFrame(
pg_df,
propertyGroup.getFile_type(),
diff --git a/maven-projects/spark/graphar/src/test/scala/org/apache/graphar/TestReader.scala b/maven-projects/spark/graphar/src/test/scala/org/apache/graphar/TestReader.scala
index f61710b95..9c52f75ee 100644
--- a/maven-projects/spark/graphar/src/test/scala/org/apache/graphar/TestReader.scala
+++ b/maven-projects/spark/graphar/src/test/scala/org/apache/graphar/TestReader.scala
@@ -20,6 +20,7 @@
package org.apache.graphar
import org.apache.graphar.reader.{VertexReader, EdgeReader}
+import org.scalatest.matchers.should.Matchers._
class ReaderSuite extends BaseTestSuite {
@@ -57,7 +58,9 @@ class ReaderSuite extends BaseTestSuite {
.format("org.apache.graphar.datasources.GarDataSource")
.load(orc_read_path)
// validate reading results
- assert(df2.rdd.collect().deep == df1.rdd.collect().deep)
+ val left = df2.rdd.collect()
+ val right = df1.rdd.collect()
+ left should contain theSameElementsAs right
df_pd = df1.filter(cond)
/**
diff --git a/maven-projects/spark/pom.xml b/maven-projects/spark/pom.xml
index 455fb1754..72e6c0854 100644
--- a/maven-projects/spark/pom.xml
+++ b/maven-projects/spark/pom.xml
@@ -35,48 +35,67 @@
pom
${graphar.version}
+
+ 2.12.15
+ 2.12
+ 512m
+ 1024m
+ 1.8
+ 1.8
+ 4.8.15
+ UTF-8
+ UTF-8
+ 2.15.4
+
+
datasources-32
graphar
- UTF-8
- UTF-8
- 2.12.10
- 2.12
- 512m
- 1024m
- 3.2.2
- 1.8
- 1.8
+ 3.2.4
graphar
datasources-32
-
- true
-
datasources-33
graphar
- UTF-8
- UTF-8
- 2.12.12
- 2.12
- 512m
- 1024m
3.3.4
- 1.8
- 1.8
graphar
datasources-33
+
+ datasources-34
+
+ graphar
+ 3.4.3
+
+
+ graphar
+ datasources-34
+
+
+
+ datasources-35
+
+ graphar
+ 3.5.1
+
+
+ graphar
+ datasources-35
+
+
+ true
+
+
@@ -108,7 +127,7 @@
${scala.version}
- -target:jvm-1.8
+ -target:jvm-${maven.compiler.target}
-Xss4096K
diff --git a/pyspark/Makefile b/pyspark/Makefile
index daea2b86a..ceac363df 100644
--- a/pyspark/Makefile
+++ b/pyspark/Makefile
@@ -19,7 +19,7 @@
install_test:
export JAVA_HOME=${JAVA_HOME_11_X64}
cd ../maven-projects/spark && mvn --no-transfer-progress clean package -DskipTests -Dspotless.check.skip=true && cd ../../pyspark
- export PYSPARK_HADOOP_VERSION=3.2
+ export PYSPARK_HADOOP_VERSION=3.3
poetry install --with=spark,tests
.PHONY: test
diff --git a/pyspark/poetry.lock b/pyspark/poetry.lock
index 48f533d1b..78abd7916 100644
--- a/pyspark/poetry.lock
+++ b/pyspark/poetry.lock
@@ -13,63 +13,83 @@ files = [
[[package]]
name = "coverage"
-version = "7.4.4"
+version = "7.6.1"
description = "Code coverage measurement for Python"
optional = false
python-versions = ">=3.8"
files = [
- {file = "coverage-7.4.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e0be5efd5127542ef31f165de269f77560d6cdef525fffa446de6f7e9186cfb2"},
- {file = "coverage-7.4.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ccd341521be3d1b3daeb41960ae94a5e87abe2f46f17224ba5d6f2b8398016cf"},
- {file = "coverage-7.4.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09fa497a8ab37784fbb20ab699c246053ac294d13fc7eb40ec007a5043ec91f8"},
- {file = "coverage-7.4.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b1a93009cb80730c9bca5d6d4665494b725b6e8e157c1cb7f2db5b4b122ea562"},
- {file = "coverage-7.4.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:690db6517f09336559dc0b5f55342df62370a48f5469fabf502db2c6d1cffcd2"},
- {file = "coverage-7.4.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:09c3255458533cb76ef55da8cc49ffab9e33f083739c8bd4f58e79fecfe288f7"},
- {file = "coverage-7.4.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:8ce1415194b4a6bd0cdcc3a1dfbf58b63f910dcb7330fe15bdff542c56949f87"},
- {file = "coverage-7.4.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b91cbc4b195444e7e258ba27ac33769c41b94967919f10037e6355e998af255c"},
- {file = "coverage-7.4.4-cp310-cp310-win32.whl", hash = "sha256:598825b51b81c808cb6f078dcb972f96af96b078faa47af7dfcdf282835baa8d"},
- {file = "coverage-7.4.4-cp310-cp310-win_amd64.whl", hash = "sha256:09ef9199ed6653989ebbcaacc9b62b514bb63ea2f90256e71fea3ed74bd8ff6f"},
- {file = "coverage-7.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0f9f50e7ef2a71e2fae92774c99170eb8304e3fdf9c8c3c7ae9bab3e7229c5cf"},
- {file = "coverage-7.4.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:623512f8ba53c422fcfb2ce68362c97945095b864cda94a92edbaf5994201083"},
- {file = "coverage-7.4.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0513b9508b93da4e1716744ef6ebc507aff016ba115ffe8ecff744d1322a7b63"},
- {file = "coverage-7.4.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40209e141059b9370a2657c9b15607815359ab3ef9918f0196b6fccce8d3230f"},
- {file = "coverage-7.4.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a2b2b78c78293782fd3767d53e6474582f62443d0504b1554370bde86cc8227"},
- {file = "coverage-7.4.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:73bfb9c09951125d06ee473bed216e2c3742f530fc5acc1383883125de76d9cd"},
- {file = "coverage-7.4.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:1f384c3cc76aeedce208643697fb3e8437604b512255de6d18dae3f27655a384"},
- {file = "coverage-7.4.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:54eb8d1bf7cacfbf2a3186019bcf01d11c666bd495ed18717162f7eb1e9dd00b"},
- {file = "coverage-7.4.4-cp311-cp311-win32.whl", hash = "sha256:cac99918c7bba15302a2d81f0312c08054a3359eaa1929c7e4b26ebe41e9b286"},
- {file = "coverage-7.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:b14706df8b2de49869ae03a5ccbc211f4041750cd4a66f698df89d44f4bd30ec"},
- {file = "coverage-7.4.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:201bef2eea65e0e9c56343115ba3814e896afe6d36ffd37bab783261db430f76"},
- {file = "coverage-7.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:41c9c5f3de16b903b610d09650e5e27adbfa7f500302718c9ffd1c12cf9d6818"},
- {file = "coverage-7.4.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d898fe162d26929b5960e4e138651f7427048e72c853607f2b200909794ed978"},
- {file = "coverage-7.4.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3ea79bb50e805cd6ac058dfa3b5c8f6c040cb87fe83de10845857f5535d1db70"},
- {file = "coverage-7.4.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce4b94265ca988c3f8e479e741693d143026632672e3ff924f25fab50518dd51"},
- {file = "coverage-7.4.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:00838a35b882694afda09f85e469c96367daa3f3f2b097d846a7216993d37f4c"},
- {file = "coverage-7.4.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:fdfafb32984684eb03c2d83e1e51f64f0906b11e64482df3c5db936ce3839d48"},
- {file = "coverage-7.4.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:69eb372f7e2ece89f14751fbcbe470295d73ed41ecd37ca36ed2eb47512a6ab9"},
- {file = "coverage-7.4.4-cp312-cp312-win32.whl", hash = "sha256:137eb07173141545e07403cca94ab625cc1cc6bc4c1e97b6e3846270e7e1fea0"},
- {file = "coverage-7.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:d71eec7d83298f1af3326ce0ff1d0ea83c7cb98f72b577097f9083b20bdaf05e"},
- {file = "coverage-7.4.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d5ae728ff3b5401cc320d792866987e7e7e880e6ebd24433b70a33b643bb0384"},
- {file = "coverage-7.4.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cc4f1358cb0c78edef3ed237ef2c86056206bb8d9140e73b6b89fbcfcbdd40e1"},
- {file = "coverage-7.4.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8130a2aa2acb8788e0b56938786c33c7c98562697bf9f4c7d6e8e5e3a0501e4a"},
- {file = "coverage-7.4.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cf271892d13e43bc2b51e6908ec9a6a5094a4df1d8af0bfc360088ee6c684409"},
- {file = "coverage-7.4.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4cdc86d54b5da0df6d3d3a2f0b710949286094c3a6700c21e9015932b81447e"},
- {file = "coverage-7.4.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ae71e7ddb7a413dd60052e90528f2f65270aad4b509563af6d03d53e979feafd"},
- {file = "coverage-7.4.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:38dd60d7bf242c4ed5b38e094baf6401faa114fc09e9e6632374388a404f98e7"},
- {file = "coverage-7.4.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:aa5b1c1bfc28384f1f53b69a023d789f72b2e0ab1b3787aae16992a7ca21056c"},
- {file = "coverage-7.4.4-cp38-cp38-win32.whl", hash = "sha256:dfa8fe35a0bb90382837b238fff375de15f0dcdb9ae68ff85f7a63649c98527e"},
- {file = "coverage-7.4.4-cp38-cp38-win_amd64.whl", hash = "sha256:b2991665420a803495e0b90a79233c1433d6ed77ef282e8e152a324bbbc5e0c8"},
- {file = "coverage-7.4.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3b799445b9f7ee8bf299cfaed6f5b226c0037b74886a4e11515e569b36fe310d"},
- {file = "coverage-7.4.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b4d33f418f46362995f1e9d4f3a35a1b6322cb959c31d88ae56b0298e1c22357"},
- {file = "coverage-7.4.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aadacf9a2f407a4688d700e4ebab33a7e2e408f2ca04dbf4aef17585389eff3e"},
- {file = "coverage-7.4.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c95949560050d04d46b919301826525597f07b33beba6187d04fa64d47ac82e"},
- {file = "coverage-7.4.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff7687ca3d7028d8a5f0ebae95a6e4827c5616b31a4ee1192bdfde697db110d4"},
- {file = "coverage-7.4.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5fc1de20b2d4a061b3df27ab9b7c7111e9a710f10dc2b84d33a4ab25065994ec"},
- {file = "coverage-7.4.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:c74880fc64d4958159fbd537a091d2a585448a8f8508bf248d72112723974cbd"},
- {file = "coverage-7.4.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:742a76a12aa45b44d236815d282b03cfb1de3b4323f3e4ec933acfae08e54ade"},
- {file = "coverage-7.4.4-cp39-cp39-win32.whl", hash = "sha256:d89d7b2974cae412400e88f35d86af72208e1ede1a541954af5d944a8ba46c57"},
- {file = "coverage-7.4.4-cp39-cp39-win_amd64.whl", hash = "sha256:9ca28a302acb19b6af89e90f33ee3e1906961f94b54ea37de6737b7ca9d8827c"},
- {file = "coverage-7.4.4-pp38.pp39.pp310-none-any.whl", hash = "sha256:b2c5edc4ac10a7ef6605a966c58929ec6c1bd0917fb8c15cb3363f65aa40e677"},
- {file = "coverage-7.4.4.tar.gz", hash = "sha256:c901df83d097649e257e803be22592aedfd5182f07b3cc87d640bbb9afd50f49"},
+ {file = "coverage-7.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b06079abebbc0e89e6163b8e8f0e16270124c154dc6e4a47b413dd538859af16"},
+ {file = "coverage-7.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cf4b19715bccd7ee27b6b120e7e9dd56037b9c0681dcc1adc9ba9db3d417fa36"},
+ {file = "coverage-7.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61c0abb4c85b095a784ef23fdd4aede7a2628478e7baba7c5e3deba61070a02"},
+ {file = "coverage-7.6.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd21f6ae3f08b41004dfb433fa895d858f3f5979e7762d052b12aef444e29afc"},
+ {file = "coverage-7.6.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f59d57baca39b32db42b83b2a7ba6f47ad9c394ec2076b084c3f029b7afca23"},
+ {file = "coverage-7.6.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a1ac0ae2b8bd743b88ed0502544847c3053d7171a3cff9228af618a068ed9c34"},
+ {file = "coverage-7.6.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e6a08c0be454c3b3beb105c0596ebdc2371fab6bb90c0c0297f4e58fd7e1012c"},
+ {file = "coverage-7.6.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f5796e664fe802da4f57a168c85359a8fbf3eab5e55cd4e4569fbacecc903959"},
+ {file = "coverage-7.6.1-cp310-cp310-win32.whl", hash = "sha256:7bb65125fcbef8d989fa1dd0e8a060999497629ca5b0efbca209588a73356232"},
+ {file = "coverage-7.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:3115a95daa9bdba70aea750db7b96b37259a81a709223c8448fa97727d546fe0"},
+ {file = "coverage-7.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7dea0889685db8550f839fa202744652e87c60015029ce3f60e006f8c4462c93"},
+ {file = "coverage-7.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed37bd3c3b063412f7620464a9ac1314d33100329f39799255fb8d3027da50d3"},
+ {file = "coverage-7.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d85f5e9a5f8b73e2350097c3756ef7e785f55bd71205defa0bfdaf96c31616ff"},
+ {file = "coverage-7.6.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bc572be474cafb617672c43fe989d6e48d3c83af02ce8de73fff1c6bb3c198d"},
+ {file = "coverage-7.6.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c0420b573964c760df9e9e86d1a9a622d0d27f417e1a949a8a66dd7bcee7bc6"},
+ {file = "coverage-7.6.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1f4aa8219db826ce6be7099d559f8ec311549bfc4046f7f9fe9b5cea5c581c56"},
+ {file = "coverage-7.6.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:fc5a77d0c516700ebad189b587de289a20a78324bc54baee03dd486f0855d234"},
+ {file = "coverage-7.6.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b48f312cca9621272ae49008c7f613337c53fadca647d6384cc129d2996d1133"},
+ {file = "coverage-7.6.1-cp311-cp311-win32.whl", hash = "sha256:1125ca0e5fd475cbbba3bb67ae20bd2c23a98fac4e32412883f9bcbaa81c314c"},
+ {file = "coverage-7.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:8ae539519c4c040c5ffd0632784e21b2f03fc1340752af711f33e5be83a9d6c6"},
+ {file = "coverage-7.6.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:95cae0efeb032af8458fc27d191f85d1717b1d4e49f7cb226cf526ff28179778"},
+ {file = "coverage-7.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5621a9175cf9d0b0c84c2ef2b12e9f5f5071357c4d2ea6ca1cf01814f45d2391"},
+ {file = "coverage-7.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:260933720fdcd75340e7dbe9060655aff3af1f0c5d20f46b57f262ab6c86a5e8"},
+ {file = "coverage-7.6.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07e2ca0ad381b91350c0ed49d52699b625aab2b44b65e1b4e02fa9df0e92ad2d"},
+ {file = "coverage-7.6.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44fee9975f04b33331cb8eb272827111efc8930cfd582e0320613263ca849ca"},
+ {file = "coverage-7.6.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:877abb17e6339d96bf08e7a622d05095e72b71f8afd8a9fefc82cf30ed944163"},
+ {file = "coverage-7.6.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3e0cadcf6733c09154b461f1ca72d5416635e5e4ec4e536192180d34ec160f8a"},
+ {file = "coverage-7.6.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c3c02d12f837d9683e5ab2f3d9844dc57655b92c74e286c262e0fc54213c216d"},
+ {file = "coverage-7.6.1-cp312-cp312-win32.whl", hash = "sha256:e05882b70b87a18d937ca6768ff33cc3f72847cbc4de4491c8e73880766718e5"},
+ {file = "coverage-7.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:b5d7b556859dd85f3a541db6a4e0167b86e7273e1cdc973e5b175166bb634fdb"},
+ {file = "coverage-7.6.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a4acd025ecc06185ba2b801f2de85546e0b8ac787cf9d3b06e7e2a69f925b106"},
+ {file = "coverage-7.6.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a6d3adcf24b624a7b778533480e32434a39ad8fa30c315208f6d3e5542aeb6e9"},
+ {file = "coverage-7.6.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0c212c49b6c10e6951362f7c6df3329f04c2b1c28499563d4035d964ab8e08c"},
+ {file = "coverage-7.6.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e81d7a3e58882450ec4186ca59a3f20a5d4440f25b1cff6f0902ad890e6748a"},
+ {file = "coverage-7.6.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78b260de9790fd81e69401c2dc8b17da47c8038176a79092a89cb2b7d945d060"},
+ {file = "coverage-7.6.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a78d169acd38300060b28d600344a803628c3fd585c912cacc9ea8790fe96862"},
+ {file = "coverage-7.6.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2c09f4ce52cb99dd7505cd0fc8e0e37c77b87f46bc9c1eb03fe3bc9991085388"},
+ {file = "coverage-7.6.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6878ef48d4227aace338d88c48738a4258213cd7b74fd9a3d4d7582bb1d8a155"},
+ {file = "coverage-7.6.1-cp313-cp313-win32.whl", hash = "sha256:44df346d5215a8c0e360307d46ffaabe0f5d3502c8a1cefd700b34baf31d411a"},
+ {file = "coverage-7.6.1-cp313-cp313-win_amd64.whl", hash = "sha256:8284cf8c0dd272a247bc154eb6c95548722dce90d098c17a883ed36e67cdb129"},
+ {file = "coverage-7.6.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d3296782ca4eab572a1a4eca686d8bfb00226300dcefdf43faa25b5242ab8a3e"},
+ {file = "coverage-7.6.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:502753043567491d3ff6d08629270127e0c31d4184c4c8d98f92c26f65019962"},
+ {file = "coverage-7.6.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a89ecca80709d4076b95f89f308544ec8f7b4727e8a547913a35f16717856cb"},
+ {file = "coverage-7.6.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a318d68e92e80af8b00fa99609796fdbcdfef3629c77c6283566c6f02c6d6704"},
+ {file = "coverage-7.6.1-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13b0a73a0896988f053e4fbb7de6d93388e6dd292b0d87ee51d106f2c11b465b"},
+ {file = "coverage-7.6.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4421712dbfc5562150f7554f13dde997a2e932a6b5f352edcce948a815efee6f"},
+ {file = "coverage-7.6.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:166811d20dfea725e2e4baa71fffd6c968a958577848d2131f39b60043400223"},
+ {file = "coverage-7.6.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:225667980479a17db1048cb2bf8bfb39b8e5be8f164b8f6628b64f78a72cf9d3"},
+ {file = "coverage-7.6.1-cp313-cp313t-win32.whl", hash = "sha256:170d444ab405852903b7d04ea9ae9b98f98ab6d7e63e1115e82620807519797f"},
+ {file = "coverage-7.6.1-cp313-cp313t-win_amd64.whl", hash = "sha256:b9f222de8cded79c49bf184bdbc06630d4c58eec9459b939b4a690c82ed05657"},
+ {file = "coverage-7.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6db04803b6c7291985a761004e9060b2bca08da6d04f26a7f2294b8623a0c1a0"},
+ {file = "coverage-7.6.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f1adfc8ac319e1a348af294106bc6a8458a0f1633cc62a1446aebc30c5fa186a"},
+ {file = "coverage-7.6.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a95324a9de9650a729239daea117df21f4b9868ce32e63f8b650ebe6cef5595b"},
+ {file = "coverage-7.6.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b43c03669dc4618ec25270b06ecd3ee4fa94c7f9b3c14bae6571ca00ef98b0d3"},
+ {file = "coverage-7.6.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8929543a7192c13d177b770008bc4e8119f2e1f881d563fc6b6305d2d0ebe9de"},
+ {file = "coverage-7.6.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:a09ece4a69cf399510c8ab25e0950d9cf2b42f7b3cb0374f95d2e2ff594478a6"},
+ {file = "coverage-7.6.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:9054a0754de38d9dbd01a46621636689124d666bad1936d76c0341f7d71bf569"},
+ {file = "coverage-7.6.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0dbde0f4aa9a16fa4d754356a8f2e36296ff4d83994b2c9d8398aa32f222f989"},
+ {file = "coverage-7.6.1-cp38-cp38-win32.whl", hash = "sha256:da511e6ad4f7323ee5702e6633085fb76c2f893aaf8ce4c51a0ba4fc07580ea7"},
+ {file = "coverage-7.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:3f1156e3e8f2872197af3840d8ad307a9dd18e615dc64d9ee41696f287c57ad8"},
+ {file = "coverage-7.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:abd5fd0db5f4dc9289408aaf34908072f805ff7792632250dcb36dc591d24255"},
+ {file = "coverage-7.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:547f45fa1a93154bd82050a7f3cddbc1a7a4dd2a9bf5cb7d06f4ae29fe94eaf8"},
+ {file = "coverage-7.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:645786266c8f18a931b65bfcefdbf6952dd0dea98feee39bd188607a9d307ed2"},
+ {file = "coverage-7.6.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9e0b2df163b8ed01d515807af24f63de04bebcecbd6c3bfeff88385789fdf75a"},
+ {file = "coverage-7.6.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:609b06f178fe8e9f89ef676532760ec0b4deea15e9969bf754b37f7c40326dbc"},
+ {file = "coverage-7.6.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:702855feff378050ae4f741045e19a32d57d19f3e0676d589df0575008ea5004"},
+ {file = "coverage-7.6.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:2bdb062ea438f22d99cba0d7829c2ef0af1d768d1e4a4f528087224c90b132cb"},
+ {file = "coverage-7.6.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9c56863d44bd1c4fe2abb8a4d6f5371d197f1ac0ebdee542f07f35895fc07f36"},
+ {file = "coverage-7.6.1-cp39-cp39-win32.whl", hash = "sha256:6e2cd258d7d927d09493c8df1ce9174ad01b381d4729a9d8d4e38670ca24774c"},
+ {file = "coverage-7.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:06a737c882bd26d0d6ee7269b20b12f14a8704807a01056c80bb881a4b2ce6ca"},
+ {file = "coverage-7.6.1-pp38.pp39.pp310-none-any.whl", hash = "sha256:e9a6e0eb86070e8ccaedfbd9d38fec54864f3125ab95419970575b42af7541df"},
+ {file = "coverage-7.6.1.tar.gz", hash = "sha256:953510dfb7b12ab69d20135a0662397f077c59b1e6379a768e97c59d852ee51d"},
]
[package.dependencies]
@@ -80,13 +100,13 @@ toml = ["tomli"]
[[package]]
name = "exceptiongroup"
-version = "1.2.0"
+version = "1.2.2"
description = "Backport of PEP 654 (exception groups)"
optional = false
python-versions = ">=3.7"
files = [
- {file = "exceptiongroup-1.2.0-py3-none-any.whl", hash = "sha256:4bfd3996ac73b41e9b9628b04e079f193850720ea5945fc96a08633c66912f14"},
- {file = "exceptiongroup-1.2.0.tar.gz", hash = "sha256:91f5c769735f051a4290d52edd0858999b57e5876e9f85937691bd4c9fa3ed68"},
+ {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"},
+ {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"},
]
[package.extras]
@@ -105,13 +125,13 @@ files = [
[[package]]
name = "jinja2"
-version = "3.1.3"
+version = "3.1.4"
description = "A very fast and expressive template engine."
optional = false
python-versions = ">=3.7"
files = [
- {file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"},
- {file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"},
+ {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"},
+ {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"},
]
[package.dependencies]
@@ -191,24 +211,24 @@ files = [
[[package]]
name = "packaging"
-version = "24.0"
+version = "24.1"
description = "Core utilities for Python packages"
optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
files = [
- {file = "packaging-24.0-py3-none-any.whl", hash = "sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5"},
- {file = "packaging-24.0.tar.gz", hash = "sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9"},
+ {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"},
+ {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"},
]
[[package]]
name = "pdoc"
-version = "14.4.0"
+version = "14.6.0"
description = "API Documentation for Python Projects"
optional = false
python-versions = ">=3.8"
files = [
- {file = "pdoc-14.4.0-py3-none-any.whl", hash = "sha256:6ea4fe07620b1f7601e2708a307a257636ec206e20b5611640b30f2e3cab47d6"},
- {file = "pdoc-14.4.0.tar.gz", hash = "sha256:c92edc425429ccbe287ace2a027953c24f13de53eab484c1a6d31ca72dd2fda9"},
+ {file = "pdoc-14.6.0-py3-none-any.whl", hash = "sha256:36c42c546a317d8e3e8c0b39645f24161374de0c7066ccaae76628d721e49ba5"},
+ {file = "pdoc-14.6.0.tar.gz", hash = "sha256:6e98a24c5e0ca5d188397969cf82581836eaef13f172fc3820047bfe15c61c9a"},
]
[package.dependencies]
@@ -221,13 +241,13 @@ dev = ["hypothesis", "mypy", "pdoc-pyo3-sample-library (==1.0.11)", "pygments (>
[[package]]
name = "pluggy"
-version = "1.4.0"
+version = "1.5.0"
description = "plugin and hook calling mechanisms for python"
optional = false
python-versions = ">=3.8"
files = [
- {file = "pluggy-1.4.0-py3-none-any.whl", hash = "sha256:7db9f7b503d67d1c5b95f59773ebb58a8c1c288129a88665838012cfb07b8981"},
- {file = "pluggy-1.4.0.tar.gz", hash = "sha256:8c85c2876142a764e5b7548e7d9a0e0ddb46f5185161049a79b7e974454223be"},
+ {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"},
+ {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"},
]
[package.extras]
@@ -236,58 +256,58 @@ testing = ["pytest", "pytest-benchmark"]
[[package]]
name = "py4j"
-version = "0.10.9.5"
+version = "0.10.9.7"
description = "Enables Python programs to dynamically access arbitrary Java objects"
optional = false
python-versions = "*"
files = [
- {file = "py4j-0.10.9.5-py2.py3-none-any.whl", hash = "sha256:52d171a6a2b031d8a5d1de6efe451cf4f5baff1a2819aabc3741c8406539ba04"},
- {file = "py4j-0.10.9.5.tar.gz", hash = "sha256:276a4a3c5a2154df1860ef3303a927460e02e97b047dc0a47c1c3fb8cce34db6"},
+ {file = "py4j-0.10.9.7-py2.py3-none-any.whl", hash = "sha256:85defdfd2b2376eb3abf5ca6474b51ab7e0de341c75a02f46dc9b5976f5a5c1b"},
+ {file = "py4j-0.10.9.7.tar.gz", hash = "sha256:0b6e5315bb3ada5cf62ac651d107bb2ebc02def3dee9d9548e3baac644ea8dbb"},
]
[[package]]
name = "pygments"
-version = "2.17.2"
+version = "2.18.0"
description = "Pygments is a syntax highlighting package written in Python."
optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
files = [
- {file = "pygments-2.17.2-py3-none-any.whl", hash = "sha256:b27c2826c47d0f3219f29554824c30c5e8945175d888647acd804ddd04af846c"},
- {file = "pygments-2.17.2.tar.gz", hash = "sha256:da46cec9fd2de5be3a8a784f434e4c4ab670b4ff54d605c4c2717e9d49c4c367"},
+ {file = "pygments-2.18.0-py3-none-any.whl", hash = "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a"},
+ {file = "pygments-2.18.0.tar.gz", hash = "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199"},
]
[package.extras]
-plugins = ["importlib-metadata"]
windows-terminal = ["colorama (>=0.4.6)"]
[[package]]
name = "pyspark"
-version = "3.2.2"
+version = "3.5.1"
description = "Apache Spark Python API"
optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.8"
files = [
- {file = "pyspark-3.2.2.tar.gz", hash = "sha256:5455214cf0b83d4a184cda25ca3b0812481915353b180cf7d7ac227728a4d99e"},
+ {file = "pyspark-3.5.1.tar.gz", hash = "sha256:dd6569e547365eadc4f887bf57f153e4d582a68c4b490de475d55b9981664910"},
]
[package.dependencies]
-py4j = "0.10.9.5"
+py4j = "0.10.9.7"
[package.extras]
-ml = ["numpy (>=1.7)"]
-mllib = ["numpy (>=1.7)"]
-pandas-on-spark = ["numpy (>=1.14)", "pandas (>=0.23.2)", "pyarrow (>=1.0.0)"]
-sql = ["pandas (>=0.23.2)", "pyarrow (>=1.0.0)"]
+connect = ["googleapis-common-protos (>=1.56.4)", "grpcio (>=1.56.0)", "grpcio-status (>=1.56.0)", "numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"]
+ml = ["numpy (>=1.15)"]
+mllib = ["numpy (>=1.15)"]
+pandas-on-spark = ["numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"]
+sql = ["numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"]
[[package]]
name = "pytest"
-version = "8.1.1"
+version = "8.3.2"
description = "pytest: simple powerful testing with Python"
optional = false
python-versions = ">=3.8"
files = [
- {file = "pytest-8.1.1-py3-none-any.whl", hash = "sha256:2a8386cfc11fa9d2c50ee7b2a57e7d898ef90470a7a34c4b949ff59662bb78b7"},
- {file = "pytest-8.1.1.tar.gz", hash = "sha256:ac978141a75948948817d360297b7aae0fcb9d6ff6bc9ec6d514b85d5a65c044"},
+ {file = "pytest-8.3.2-py3-none-any.whl", hash = "sha256:4ba08f9ae7dcf84ded419494d229b48d0903ea6407b030eaec46df5e6a73bba5"},
+ {file = "pytest-8.3.2.tar.gz", hash = "sha256:c132345d12ce551242c87269de812483f5bcc87cdbb4722e48487ba194f9fdce"},
]
[package.dependencies]
@@ -295,11 +315,11 @@ colorama = {version = "*", markers = "sys_platform == \"win32\""}
exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
iniconfig = "*"
packaging = "*"
-pluggy = ">=1.4,<2.0"
+pluggy = ">=1.5,<2"
tomli = {version = ">=1", markers = "python_version < \"3.11\""}
[package.extras]
-testing = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
+dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "pytest-cov"
@@ -321,88 +341,91 @@ testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"]
[[package]]
name = "pyyaml"
-version = "6.0.1"
+version = "6.0.2"
description = "YAML parser and emitter for Python"
optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.8"
files = [
- {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"},
- {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"},
- {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"},
- {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"},
- {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"},
- {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"},
- {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"},
- {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"},
- {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"},
- {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"},
- {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"},
- {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"},
- {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"},
- {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"},
- {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"},
- {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
- {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
- {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
- {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
- {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
- {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
- {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
- {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"},
- {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"},
- {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"},
- {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"},
- {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"},
- {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"},
- {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"},
- {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"},
- {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"},
- {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"},
- {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"},
- {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"},
- {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"},
- {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"},
- {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"},
- {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"},
- {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"},
- {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"},
- {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"},
- {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"},
- {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"},
- {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"},
- {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"},
- {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"},
- {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"},
- {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"},
- {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"},
- {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"},
- {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
+ {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
+ {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
+ {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"},
+ {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"},
+ {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"},
+ {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"},
+ {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"},
+ {file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"},
+ {file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"},
+ {file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"},
+ {file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"},
+ {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"},
+ {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"},
+ {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"},
+ {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"},
+ {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"},
+ {file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"},
+ {file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"},
+ {file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"},
+ {file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"},
+ {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"},
+ {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"},
+ {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"},
+ {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"},
+ {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"},
+ {file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"},
+ {file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"},
+ {file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"},
+ {file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"},
+ {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"},
+ {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"},
+ {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"},
+ {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"},
+ {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"},
+ {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"},
+ {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"},
+ {file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"},
+ {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"},
+ {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"},
+ {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"},
+ {file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"},
+ {file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"},
+ {file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"},
+ {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"},
+ {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"},
+ {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"},
+ {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"},
+ {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"},
+ {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"},
+ {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"},
+ {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"},
+ {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"},
+ {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"},
]
[[package]]
name = "ruff"
-version = "0.3.7"
+version = "0.5.7"
description = "An extremely fast Python linter and code formatter, written in Rust."
optional = false
python-versions = ">=3.7"
files = [
- {file = "ruff-0.3.7-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0e8377cccb2f07abd25e84fc5b2cbe48eeb0fea9f1719cad7caedb061d70e5ce"},
- {file = "ruff-0.3.7-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:15a4d1cc1e64e556fa0d67bfd388fed416b7f3b26d5d1c3e7d192c897e39ba4b"},
- {file = "ruff-0.3.7-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d28bdf3d7dc71dd46929fafeec98ba89b7c3550c3f0978e36389b5631b793663"},
- {file = "ruff-0.3.7-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:379b67d4f49774ba679593b232dcd90d9e10f04d96e3c8ce4a28037ae473f7bb"},
- {file = "ruff-0.3.7-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c060aea8ad5ef21cdfbbe05475ab5104ce7827b639a78dd55383a6e9895b7c51"},
- {file = "ruff-0.3.7-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:ebf8f615dde968272d70502c083ebf963b6781aacd3079081e03b32adfe4d58a"},
- {file = "ruff-0.3.7-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d48098bd8f5c38897b03604f5428901b65e3c97d40b3952e38637b5404b739a2"},
- {file = "ruff-0.3.7-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:da8a4fda219bf9024692b1bc68c9cff4b80507879ada8769dc7e985755d662ea"},
- {file = "ruff-0.3.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c44e0149f1d8b48c4d5c33d88c677a4aa22fd09b1683d6a7ff55b816b5d074f"},
- {file = "ruff-0.3.7-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:3050ec0af72b709a62ecc2aca941b9cd479a7bf2b36cc4562f0033d688e44fa1"},
- {file = "ruff-0.3.7-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:a29cc38e4c1ab00da18a3f6777f8b50099d73326981bb7d182e54a9a21bb4ff7"},
- {file = "ruff-0.3.7-py3-none-musllinux_1_2_i686.whl", hash = "sha256:5b15cc59c19edca917f51b1956637db47e200b0fc5e6e1878233d3a938384b0b"},
- {file = "ruff-0.3.7-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:e491045781b1e38b72c91247cf4634f040f8d0cb3e6d3d64d38dcf43616650b4"},
- {file = "ruff-0.3.7-py3-none-win32.whl", hash = "sha256:bc931de87593d64fad3a22e201e55ad76271f1d5bfc44e1a1887edd0903c7d9f"},
- {file = "ruff-0.3.7-py3-none-win_amd64.whl", hash = "sha256:5ef0e501e1e39f35e03c2acb1d1238c595b8bb36cf7a170e7c1df1b73da00e74"},
- {file = "ruff-0.3.7-py3-none-win_arm64.whl", hash = "sha256:789e144f6dc7019d1f92a812891c645274ed08af6037d11fc65fcbc183b7d59f"},
- {file = "ruff-0.3.7.tar.gz", hash = "sha256:d5c1aebee5162c2226784800ae031f660c350e7a3402c4d1f8ea4e97e232e3ba"},
+ {file = "ruff-0.5.7-py3-none-linux_armv6l.whl", hash = "sha256:548992d342fc404ee2e15a242cdbea4f8e39a52f2e7752d0e4cbe88d2d2f416a"},
+ {file = "ruff-0.5.7-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:00cc8872331055ee017c4f1071a8a31ca0809ccc0657da1d154a1d2abac5c0be"},
+ {file = "ruff-0.5.7-py3-none-macosx_11_0_arm64.whl", hash = "sha256:eaf3d86a1fdac1aec8a3417a63587d93f906c678bb9ed0b796da7b59c1114a1e"},
+ {file = "ruff-0.5.7-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a01c34400097b06cf8a6e61b35d6d456d5bd1ae6961542de18ec81eaf33b4cb8"},
+ {file = "ruff-0.5.7-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fcc8054f1a717e2213500edaddcf1dbb0abad40d98e1bd9d0ad364f75c763eea"},
+ {file = "ruff-0.5.7-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7f70284e73f36558ef51602254451e50dd6cc479f8b6f8413a95fcb5db4a55fc"},
+ {file = "ruff-0.5.7-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:a78ad870ae3c460394fc95437d43deb5c04b5c29297815a2a1de028903f19692"},
+ {file = "ruff-0.5.7-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9ccd078c66a8e419475174bfe60a69adb36ce04f8d4e91b006f1329d5cd44bcf"},
+ {file = "ruff-0.5.7-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7e31c9bad4ebf8fdb77b59cae75814440731060a09a0e0077d559a556453acbb"},
+ {file = "ruff-0.5.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d796327eed8e168164346b769dd9a27a70e0298d667b4ecee6877ce8095ec8e"},
+ {file = "ruff-0.5.7-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:4a09ea2c3f7778cc635e7f6edf57d566a8ee8f485f3c4454db7771efb692c499"},
+ {file = "ruff-0.5.7-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:a36d8dcf55b3a3bc353270d544fb170d75d2dff41eba5df57b4e0b67a95bb64e"},
+ {file = "ruff-0.5.7-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9369c218f789eefbd1b8d82a8cf25017b523ac47d96b2f531eba73770971c9e5"},
+ {file = "ruff-0.5.7-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b88ca3db7eb377eb24fb7c82840546fb7acef75af4a74bd36e9ceb37a890257e"},
+ {file = "ruff-0.5.7-py3-none-win32.whl", hash = "sha256:33d61fc0e902198a3e55719f4be6b375b28f860b09c281e4bdbf783c0566576a"},
+ {file = "ruff-0.5.7-py3-none-win_amd64.whl", hash = "sha256:083bbcbe6fadb93cd86709037acc510f86eed5a314203079df174c40bbbca6b3"},
+ {file = "ruff-0.5.7-py3-none-win_arm64.whl", hash = "sha256:2dca26154ff9571995107221d0aeaad0e75a77b5a682d6236cf89a58c70b76f4"},
+ {file = "ruff-0.5.7.tar.gz", hash = "sha256:8dfc0a458797f5d9fb622dd0efc52d796f23f0a1493a9527f4e49a550ae9a7e5"},
]
[[package]]
@@ -418,5 +441,5 @@ files = [
[metadata]
lock-version = "2.0"
-python-versions = "^3.9"
-content-hash = "31e8625c12ffbe3c361cd9d65293e1c1024f6b2361a8a5dc3d1f40799f9d020a"
+python-versions = "^3.10"
+content-hash = "123e743d47ce2a0da8bf405e81ee439cd40a1cc8ef65e578497416649a1b14a4"
diff --git a/pyspark/pyproject.toml b/pyspark/pyproject.toml
index 9e5aa275d..90e45d99b 100644
--- a/pyspark/pyproject.toml
+++ b/pyspark/pyproject.toml
@@ -24,13 +24,13 @@ readme = "README.md"
packages = [{include = "graphar_pyspark"}]
[tool.poetry.dependencies]
-python = "^3.9"
+python = "^3.10"
[tool.poetry.group.spark]
optional = true
[tool.poetry.group.spark.dependencies]
-pyspark = "3.2.2" # TODO: relax requirement when scala part will be available for multiple spark versions
+pyspark = "3.5.1" # TODO: relax requirement when scala part will be available for multiple spark versions
[tool.poetry.group.lint]
optional = true