diff --git a/.github/workflows/pyspark.yml b/.github/workflows/pyspark.yml index 702122c4e..8630a1ce9 100644 --- a/.github/workflows/pyspark.yml +++ b/.github/workflows/pyspark.yml @@ -50,7 +50,7 @@ jobs: - name: Install Python uses: actions/setup-python@v4 with: - python-version: 3.9 + python-version: '3.10' - name: Install Poetry working-directory: pyspark diff --git a/.github/workflows/spark.yaml b/.github/workflows/spark.yaml index 3d7dc96f4..cae0fc2dd 100644 --- a/.github/workflows/spark.yaml +++ b/.github/workflows/spark.yaml @@ -47,11 +47,17 @@ jobs: matrix: include: - mvn-profile: "datasources-32" - spark: "spark-3.2.2" - spark-hadoop: "spark-3.2.2-bin-hadoop3.2" + spark: "spark-3.2.4" + spark-hadoop: "spark-3.2.4-bin-hadoop3.2" - mvn-profile: "datasources-33" spark: "spark-3.3.4" spark-hadoop: "spark-3.3.4-bin-hadoop3" + - mvn-profile: "datasources-34" + spark: "spark-3.4.3" + spark-hadoop: "spark-3.4.3-bin-hadoop3" + - mvn-profile: "datasources-35" + spark: "spark-3.5.1" + spark-hadoop: "spark-3.5.1-bin-hadoop3" steps: - uses: actions/checkout@v4 @@ -117,7 +123,10 @@ jobs: echo "match (a) -[r] -> () delete a, r;match (a) delete a;" | cypher-shell -u ${NEO4J_USR} -p ${NEO4J_PWD} -d neo4j --format plain scripts/run-graphar2neo4j.sh + # Apache Spark version 3.4.3 is not supported by the current NebulaGraph Spark Connector. - name: Run Nebula2GraphAr example + # https://github.com/orgs/community/discussions/37883#discussioncomment-4021318 + if: ${{ matrix.spark < 'spark-3.4.3' }} working-directory: maven-projects/spark run: | export JAVA_HOME=${JAVA_HOME_11_X64} diff --git a/.gitignore b/.gitignore index 0d520ae44..08a7fbcac 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,11 @@ .DS_store .cache .ccls-cache +.dir-locals.el +.classpath +.project +.settings +.factorypath compile_commands.json diff --git a/licenserc.toml b/licenserc.toml index ed4a4c141..56db55c85 100644 --- a/licenserc.toml +++ b/licenserc.toml @@ -48,6 +48,10 @@ excludes = [ "spark/datasources-32/src/main/scala/org/apache/spark/sql/graphar", "spark/datasources-33/src/main/scala/org/apache/graphar/datasources", "spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar", + "spark/datasources-34/src/main/scala/org/apache/graphar/datasources", + "spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar", + "spark/datasources-35/src/main/scala/org/apache/graphar/datasources", + "spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar", "java/src/main/java/org/apache/graphar/stdcxx/StdString.java", "java/src/main/java/org/apache/graphar/stdcxx/StdVector.java", "java/src/main/java/org/apache/graphar/stdcxx/StdSharedPtr.java", diff --git a/maven-projects/spark/datasources-32/pom.xml b/maven-projects/spark/datasources-32/pom.xml index 265eb4c67..9bde15e4d 100644 --- a/maven-projects/spark/datasources-32/pom.xml +++ b/maven-projects/spark/datasources-32/pom.xml @@ -77,7 +77,7 @@ ${scala.version} - -target:jvm-1.8 + -target:jvm-${maven.compiler.target} -Xss4096K @@ -128,8 +128,8 @@ org.scalameta - semanticdb-scalac_2.12.10 - 4.3.24 + semanticdb-scalac_${scala.version} + ${semanticdb-scalac.version} diff --git a/maven-projects/spark/datasources-33/pom.xml b/maven-projects/spark/datasources-33/pom.xml index 265eb4c67..9bde15e4d 100644 --- a/maven-projects/spark/datasources-33/pom.xml +++ b/maven-projects/spark/datasources-33/pom.xml @@ -77,7 +77,7 @@ ${scala.version} - -target:jvm-1.8 + -target:jvm-${maven.compiler.target} -Xss4096K @@ -128,8 +128,8 @@ org.scalameta - semanticdb-scalac_2.12.10 - 4.3.24 
+ semanticdb-scalac_${scala.version} + ${semanticdb-scalac.version} diff --git a/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala b/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala index a4d5207b7..e93027634 100644 --- a/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala +++ b/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala @@ -212,7 +212,8 @@ case class GarScan( val parsedOptions = new JSONOptionsInRead( CaseInsensitiveMap(options.asScala.toMap), sparkSession.sessionState.conf.sessionLocalTimeZone, - sparkSession.sessionState.conf.columnNameOfCorruptRecord) + sparkSession.sessionState.conf.columnNameOfCorruptRecord + ) // Check a field requirement for corrupt records here to throw an exception in a driver side ExprUtils.verifyColumnNameOfCorruptRecord( diff --git a/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala b/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala index 3b2ca60ee..23b51b6e9 100644 --- a/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala +++ b/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala @@ -52,7 +52,7 @@ case class GarScanBuilder( this.filters = dataFilters formatName match { case "csv" => Array.empty[Filter] - case "json" => Array.empty[Filter] + case "json" => Array.empty[Filter] case "orc" => pushedOrcFilters case "parquet" => pushedParquetFilters case _ => @@ -84,9 +84,9 @@ case class GarScanBuilder( // Check if the file format supports nested schema pruning. override protected val supportsNestedSchemaPruning: Boolean = formatName match { - case "csv" => false + case "csv" => false case "json" => false - case "orc" => sparkSession.sessionState.conf.nestedSchemaPruningEnabled + case "orc" => sparkSession.sessionState.conf.nestedSchemaPruningEnabled case "parquet" => sparkSession.sessionState.conf.nestedSchemaPruningEnabled case _ => diff --git a/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala b/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala index e24e9051b..df874ea32 100644 --- a/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala +++ b/maven-projects/spark/datasources-33/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala @@ -86,20 +86,20 @@ case class GarTable( case "parquet" => ParquetUtils.inferSchema(sparkSession, options.asScala.toMap, files) case "json" => { - val parsedOptions = new JSONOptions( - options.asScala.toMap, - sparkSession.sessionState.conf.sessionLocalTimeZone - ) - - JsonDataSource(parsedOptions).inferSchema( - sparkSession, - files, - parsedOptions - ) + val parsedOptions = new JSONOptions( + options.asScala.toMap, + sparkSession.sessionState.conf.sessionLocalTimeZone + ) + + JsonDataSource(parsedOptions).inferSchema( + sparkSession, + files, + parsedOptions + ) } case _ => throw new IllegalArgumentException("Invalid format name: " + formatName) - + } /** Construct a new write builder according to the actual file format. 
*/ diff --git a/maven-projects/spark/datasources-34/.scalafmt.conf b/maven-projects/spark/datasources-34/.scalafmt.conf new file mode 120000 index 000000000..4cb05e831 --- /dev/null +++ b/maven-projects/spark/datasources-34/.scalafmt.conf @@ -0,0 +1 @@ +../.scalafmt.conf \ No newline at end of file diff --git a/maven-projects/spark/datasources-34/pom.xml b/maven-projects/spark/datasources-34/pom.xml new file mode 100644 index 000000000..9bde15e4d --- /dev/null +++ b/maven-projects/spark/datasources-34/pom.xml @@ -0,0 +1,193 @@ + + + + + 4.0.0 + + + org.apache.graphar + spark + ${graphar.version} + ../pom.xml + + + graphar-datasources + ${graphar.version} + jar + + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-mllib_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-hive_${scala.binary.version} + ${spark.version} + provided + + + + + + + org.scala-tools + maven-scala-plugin + 2.15.2 + + ${scala.version} + + -target:jvm-${maven.compiler.target} + + + -Xss4096K + + + + + scala-compile + + compile + + + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + scala-test-compile + + testCompile + + + + + + net.alchim31.maven + scala-maven-plugin + 4.8.0 + + + + compile + testCompile + + + + + + -Xms64m + -Xmx1024m + + + -Ywarn-unused + + + + org.scalameta + semanticdb-scalac_${scala.version} + ${semanticdb-scalac.version} + + + + + + com.diffplug.spotless + spotless-maven-plugin + 2.20.0 + + + + + + + 1.13.0 + + + + + + ${project.basedir}/.scalafmt.conf + + + + + + io.github.evis + scalafix-maven-plugin_2.13 + 0.1.8_0.11.0 + + + org.apache.maven.plugins + maven-source-plugin + + + attach-sources + + jar + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + attach-javadocs + + jar + + + + + + maven-site-plugin + 3.7.1 + + + + diff --git a/maven-projects/spark/datasources-34/src/main/java/org/apache/graphar/GeneralParams.java b/maven-projects/spark/datasources-34/src/main/java/org/apache/graphar/GeneralParams.java new file mode 120000 index 000000000..a3915d619 --- /dev/null +++ b/maven-projects/spark/datasources-34/src/main/java/org/apache/graphar/GeneralParams.java @@ -0,0 +1 @@ +../../../../../../../graphar/src/main/java/org/apache/graphar/GeneralParams.java \ No newline at end of file diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/graphar/datasources/GarDataSource.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/graphar/datasources/GarDataSource.scala new file mode 100644 index 000000000..e502f82c6 --- /dev/null +++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/graphar/datasources/GarDataSource.scala @@ -0,0 +1,180 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.graphar.datasources + +import scala.collection.JavaConverters._ +import scala.util.matching.Regex +import java.util +import com.fasterxml.jackson.databind.ObjectMapper +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.connector.catalog.{Table, TableProvider} +import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat +import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat +import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.execution.datasources.json.JsonFileFormat +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.sql.sources.DataSourceRegister +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.graphar.GarTable + +// Derived from Apache Spark 3.1.1 +// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala + +/** + * GarDataSource is a class to provide gar files as the data source for spark. + */ +class GarDataSource extends TableProvider with DataSourceRegister { + private val REDACTION_REPLACEMENT_TEXT = "*********(redacted)" + + /** + * Redact the sensitive information in the given string. + */ + // Copy of redact from graphar Utils + private def redact(regex: Option[Regex], text: String): String = { + regex match { + case None => text + case Some(r) => + if (text == null || text.isEmpty) { + text + } else { + r.replaceAllIn(text, REDACTION_REPLACEMENT_TEXT) + } + } + } + + /** The default fallback file format is Parquet. */ + def fallbackFileFormat: Class[_ <: FileFormat] = classOf[ParquetFileFormat] + + lazy val sparkSession = SparkSession.active + + /** The string that represents the format name. 
*/ + override def shortName(): String = "gar" + + protected def getPaths(map: CaseInsensitiveStringMap): Seq[String] = { + val objectMapper = new ObjectMapper() + val paths = Option(map.get("paths")) + .map { pathStr => + objectMapper.readValue(pathStr, classOf[Array[String]]).toSeq + } + .getOrElse(Seq.empty) + paths ++ Option(map.get("path")).toSeq + } + + protected def getOptionsWithoutPaths( + map: CaseInsensitiveStringMap + ): CaseInsensitiveStringMap = { + val withoutPath = map.asCaseSensitiveMap().asScala.filterKeys { k => + !k.equalsIgnoreCase("path") && !k.equalsIgnoreCase("paths") + } + new CaseInsensitiveStringMap(withoutPath.toMap.asJava) + } + + protected def getTableName( + map: CaseInsensitiveStringMap, + paths: Seq[String] + ): String = { + val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions( + map.asCaseSensitiveMap().asScala.toMap + ) + val name = shortName() + " " + paths + .map(qualifiedPathName(_, hadoopConf)) + .mkString(",") + redact(sparkSession.sessionState.conf.stringRedactionPattern, name) + } + + private def qualifiedPathName( + path: String, + hadoopConf: Configuration + ): String = { + val hdfsPath = new Path(path) + val fs = hdfsPath.getFileSystem(hadoopConf) + hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory).toString + } + + /** Provide a table from the data source. */ + def getTable(options: CaseInsensitiveStringMap): Table = { + val paths = getPaths(options) + val tableName = getTableName(options, paths) + val optionsWithoutPaths = getOptionsWithoutPaths(options) + GarTable( + tableName, + sparkSession, + optionsWithoutPaths, + paths, + None, + getFallbackFileFormat(options) + ) + } + + /** Provide a table from the data source with specific schema. */ + def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = { + val paths = getPaths(options) + val tableName = getTableName(options, paths) + val optionsWithoutPaths = getOptionsWithoutPaths(options) + GarTable( + tableName, + sparkSession, + optionsWithoutPaths, + paths, + Some(schema), + getFallbackFileFormat(options) + ) + } + + override def supportsExternalMetadata(): Boolean = true + + private var t: Table = null + + override def inferSchema(options: CaseInsensitiveStringMap): StructType = { + if (t == null) t = getTable(options) + t.schema() + } + + override def inferPartitioning( + options: CaseInsensitiveStringMap + ): Array[Transform] = { + Array.empty + } + + override def getTable( + schema: StructType, + partitioning: Array[Transform], + properties: util.Map[String, String] + ): Table = { + // If the table is already loaded during schema inference, return it directly. + if (t != null) { + t + } else { + getTable(new CaseInsensitiveStringMap(properties), schema) + } + } + + // Get the actual fall back file format. 
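As a usage sketch only (not part of this patch): once the datasources-34 jar is on the classpath and the provider is registered under its `gar` short name, a GraphAr chunk directory can be loaded through the ordinary DataFrame reader, with the `fileFormat` option selecting the concrete reader resolved by `getFallbackFileFormat`. The path and option values below are illustrative assumptions.

```scala
import org.apache.spark.sql.SparkSession

object GarReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("gar-read-sketch").getOrCreate()

    // Read a set of GraphAr chunks; "fileFormat" must be one of csv/orc/parquet/json.
    val chunks = spark.read
      .format("gar")                                // short name registered by GarDataSource
      .option("fileFormat", "parquet")
      .load("/tmp/graphar/vertex/person/firstName") // hypothetical chunk directory
    chunks.show()

    spark.stop()
  }
}
```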
+ private def getFallbackFileFormat( + options: CaseInsensitiveStringMap + ): Class[_ <: FileFormat] = options.get("fileFormat") match { + case "csv" => classOf[CSVFileFormat] + case "orc" => classOf[OrcFileFormat] + case "parquet" => classOf[ParquetFileFormat] + case "json" => classOf[JsonFileFormat] + case _ => throw new IllegalArgumentException + } +} diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarCommitProtocol.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarCommitProtocol.scala new file mode 100644 index 000000000..c6ca79c21 --- /dev/null +++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarCommitProtocol.scala @@ -0,0 +1,97 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Derived from Apache Spark 3.3.4 +// https://github.com/apache/spark/blob/18db204/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala + +package org.apache.spark.sql.graphar + +import org.apache.graphar.GeneralParams + +import org.json4s._ +import org.json4s.jackson.JsonMethods._ + +import org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol +import org.apache.hadoop.mapreduce._ +import org.apache.spark.internal.Logging +import org.apache.spark.internal.io.FileNameSpec + +object GarCommitProtocol { + private def binarySearchPair(aggNums: Array[Int], key: Int): (Int, Int) = { + var low = 0 + var high = aggNums.length - 1 + var mid = 0 + while (low <= high) { + mid = (high + low) / 2; + if ( + aggNums(mid) <= key && (mid == aggNums.length - 1 || aggNums( + mid + 1 + ) > key) + ) { + return (mid, key - aggNums(mid)) + } else if (aggNums(mid) > key) { + high = mid - 1 + } else { + low = mid + 1 + } + } + return (low, key - aggNums(low)) + } +} + +class GarCommitProtocol( + jobId: String, + path: String, + options: Map[String, String], + dynamicPartitionOverwrite: Boolean = false +) extends SQLHadoopMapReduceCommitProtocol( + jobId, + path, + dynamicPartitionOverwrite + ) + with Serializable + with Logging { + + // override getFilename to customize the file name + override def getFilename( + taskContext: TaskAttemptContext, + spec: FileNameSpec + ): String = { + val partitionId = taskContext.getTaskAttemptID.getTaskID.getId + if (options.contains(GeneralParams.offsetStartChunkIndexKey)) { + // offset chunk file name, looks like chunk0 + val chunk_index = + options(GeneralParams.offsetStartChunkIndexKey).toInt + partitionId + return f"chunk$chunk_index" + } + if (options.contains(GeneralParams.aggNumListOfEdgeChunkKey)) { + // edge chunk file name, looks like part0/chunk0 + val jValue = parse( + options(GeneralParams.aggNumListOfEdgeChunkKey) + ) + implicit val formats = + DefaultFormats 
// initialize a default formats for json4s + val aggNums: Array[Int] = Extraction.extract[Array[Int]](jValue) + val chunkPair: (Int, Int) = + GarCommitProtocol.binarySearchPair(aggNums, partitionId) + val vertex_chunk_index: Int = chunkPair._1 + val edge_chunk_index: Int = chunkPair._2 + return f"part$vertex_chunk_index/chunk$edge_chunk_index" + } + // vertex chunk file name, looks like chunk0 + return f"chunk$partitionId" + } +} diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala new file mode 100644 index 000000000..cde86e5d3 --- /dev/null +++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala @@ -0,0 +1,362 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Derived from Apache Spark 3.3.4 +// https://github.com/apache/spark/blob/18db204/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala + +package org.apache.spark.sql.graphar + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.parquet.hadoop.ParquetInputFormat +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.csv.CSVOptions +import org.apache.spark.sql.catalyst.json.JSONOptionsInRead +import org.apache.spark.sql.catalyst.expressions.{ExprUtils, Expression} +import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.connector.read.PartitionReaderFactory +import org.apache.spark.sql.execution.PartitionedFileUtil +import org.apache.spark.sql.execution.datasources.{ + FilePartition, + PartitionedFile, + PartitioningAwareFileIndex +} +import org.apache.spark.sql.execution.datasources.parquet.{ + ParquetOptions, + ParquetReadSupport, + ParquetWriteSupport +} +import org.apache.spark.sql.execution.datasources.v2.FileScan +import org.apache.spark.sql.execution.datasources.v2.csv.CSVPartitionReaderFactory +import org.apache.spark.sql.execution.datasources.v2.json.JsonPartitionReaderFactory +import org.apache.spark.sql.execution.datasources.v2.orc.OrcPartitionReaderFactory +import org.apache.spark.sql.execution.datasources.orc.OrcOptions +import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetPartitionReaderFactory +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.util.SerializableConfiguration + +import scala.collection.mutable.ArrayBuffer +import scala.jdk.CollectionConverters._ +import org.apache.spark.memory.MemoryMode + +/** GarScan is a class to implement the file scan for 
GarDataSource. */ +case class GarScan( + sparkSession: SparkSession, + hadoopConf: Configuration, + fileIndex: PartitioningAwareFileIndex, + dataSchema: StructType, + readDataSchema: StructType, + readPartitionSchema: StructType, + pushedFilters: Array[Filter], + options: CaseInsensitiveStringMap, + formatName: String, + partitionFilters: Seq[Expression] = Seq.empty, + dataFilters: Seq[Expression] = Seq.empty +) extends FileScan { + + /** The gar format is not splitable. */ + override def isSplitable(path: Path): Boolean = false + + /** Create the reader factory according to the actual file format. */ + override def createReaderFactory(): PartitionReaderFactory = + formatName match { + case "csv" => createCSVReaderFactory() + case "orc" => createOrcReaderFactory() + case "parquet" => createParquetReaderFactory() + case "json" => createJSONReaderFactory() + case _ => + throw new IllegalArgumentException("Invalid format name: " + formatName) + } + + // Create the reader factory for the CSV format. + private def createCSVReaderFactory(): PartitionReaderFactory = { + val columnPruning = sparkSession.sessionState.conf.csvColumnPruning && + !readDataSchema.exists( + _.name == sparkSession.sessionState.conf.columnNameOfCorruptRecord + ) + + val parsedOptions: CSVOptions = new CSVOptions( + options.asScala.toMap, + columnPruning = columnPruning, + sparkSession.sessionState.conf.sessionLocalTimeZone, + sparkSession.sessionState.conf.columnNameOfCorruptRecord + ) + + // Check a field requirement for corrupt records here to throw an exception in a driver side + ExprUtils.verifyColumnNameOfCorruptRecord( + dataSchema, + parsedOptions.columnNameOfCorruptRecord + ) + // Don't push any filter which refers to the "virtual" column which cannot present in the input. + // Such filters will be applied later on the upper layer. + val actualFilters = + pushedFilters.filterNot( + _.references.contains(parsedOptions.columnNameOfCorruptRecord) + ) + + val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap + // Hadoop Configurations are case sensitive. + val hadoopConf = + sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap) + val broadcastedConf = sparkSession.sparkContext.broadcast( + new SerializableConfiguration(hadoopConf) + ) + // The partition values are already truncated in `FileScan.partitions`. + // We should use `readPartitionSchema` as the partition schema here. + CSVPartitionReaderFactory( + sparkSession.sessionState.conf, + broadcastedConf, + dataSchema, + readDataSchema, + readPartitionSchema, + parsedOptions, + actualFilters + ) + } + + // Create the reader factory for the Orc format. + private def createOrcReaderFactory(): PartitionReaderFactory = { + val broadcastedConf = sparkSession.sparkContext.broadcast( + new SerializableConfiguration(hadoopConf) + ) + // The partition values are already truncated in `FileScan.partitions`. + // We should use `readPartitionSchema` as the partition schema here. + OrcPartitionReaderFactory( + sqlConf = sparkSession.sessionState.conf, + broadcastedConf = broadcastedConf, + dataSchema = dataSchema, + readDataSchema = readDataSchema, + partitionSchema = readPartitionSchema, + filters = pushedFilters, + aggregation = None, + options = new OrcOptions( + Map.empty[String, String], + sparkSession.sessionState.conf + ), + memoryMode = MemoryMode.ON_HEAP + ) + } + + // Create the reader factory for the Parquet format. 
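Stepping back to the `GarCommitProtocol.getFilename` logic added above: for edge chunks it maps a Spark partition id to a `part<vertex chunk>/chunk<edge chunk>` file name by binary-searching the aggregated edge-chunk counts. The self-contained sketch below re-states that (private) search purely as an illustration; the sample `aggNums` values are made up.

```scala
object EdgeChunkNameSketch {
  // Same search as GarCommitProtocol.binarySearchPair: find the last index whose
  // aggregated edge-chunk count is <= key, plus the offset of key within that bucket.
  private def binarySearchPair(aggNums: Array[Int], key: Int): (Int, Int) = {
    var low = 0
    var high = aggNums.length - 1
    while (low <= high) {
      val mid = (low + high) / 2
      if (aggNums(mid) <= key && (mid == aggNums.length - 1 || aggNums(mid + 1) > key)) {
        return (mid, key - aggNums(mid))
      } else if (aggNums(mid) > key) {
        high = mid - 1
      } else {
        low = mid + 1
      }
    }
    (low, key - aggNums(low))
  }

  def main(args: Array[String]): Unit = {
    // Hypothetical layout: vertex chunk 0 owns edge chunks 0-2, vertex chunk 1 owns 3-4.
    val aggNums = Array(0, 3, 5)
    val (vertexChunk, edgeChunk) = binarySearchPair(aggNums, 4)
    println(s"part$vertexChunk/chunk$edgeChunk") // -> part1/chunk1
  }
}
```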
+ private def createParquetReaderFactory(): PartitionReaderFactory = { + val readDataSchemaAsJson = readDataSchema.json + hadoopConf.set( + ParquetInputFormat.READ_SUPPORT_CLASS, + classOf[ParquetReadSupport].getName + ) + hadoopConf.set( + ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, + readDataSchemaAsJson + ) + hadoopConf.set(ParquetWriteSupport.SPARK_ROW_SCHEMA, readDataSchemaAsJson) + hadoopConf.set( + SQLConf.SESSION_LOCAL_TIMEZONE.key, + sparkSession.sessionState.conf.sessionLocalTimeZone + ) + hadoopConf.setBoolean( + SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key, + sparkSession.sessionState.conf.nestedSchemaPruningEnabled + ) + hadoopConf.setBoolean( + SQLConf.CASE_SENSITIVE.key, + sparkSession.sessionState.conf.caseSensitiveAnalysis + ) + + ParquetWriteSupport.setSchema(readDataSchema, hadoopConf) + + // Sets flags for `ParquetToSparkSchemaConverter` + hadoopConf.setBoolean( + SQLConf.PARQUET_BINARY_AS_STRING.key, + sparkSession.sessionState.conf.isParquetBinaryAsString + ) + hadoopConf.setBoolean( + SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, + sparkSession.sessionState.conf.isParquetINT96AsTimestamp + ) + hadoopConf.setBoolean( + SQLConf.LEGACY_PARQUET_NANOS_AS_LONG.key, + sparkSession.sessionState.conf.legacyParquetNanosAsLong + ) + hadoopConf.setBoolean( + SQLConf.PARQUET_FIELD_ID_READ_ENABLED.key, + sparkSession.sessionState.conf.parquetFieldIdReadEnabled + ) + hadoopConf.setBoolean( + SQLConf.PARQUET_INFER_TIMESTAMP_NTZ_ENABLED.key, + sparkSession.sessionState.conf.parquetInferTimestampNTZEnabled + ) + + val broadcastedConf = sparkSession.sparkContext.broadcast( + new SerializableConfiguration(hadoopConf) + ) + val sqlConf = sparkSession.sessionState.conf + ParquetPartitionReaderFactory( + sqlConf = sqlConf, + broadcastedConf = broadcastedConf, + dataSchema = dataSchema, + readDataSchema = readDataSchema, + partitionSchema = readPartitionSchema, + filters = pushedFilters, + aggregation = None, + new ParquetOptions(options.asCaseSensitiveMap.asScala.toMap, sqlConf) + ) + } + + // Create the reader factory for the JSON format. + private def createJSONReaderFactory(): PartitionReaderFactory = { + val parsedOptions = new JSONOptionsInRead( + CaseInsensitiveMap(options.asScala.toMap), + sparkSession.sessionState.conf.sessionLocalTimeZone, + sparkSession.sessionState.conf.columnNameOfCorruptRecord + ) + + // Check a field requirement for corrupt records here to throw an exception in a driver side + ExprUtils.verifyColumnNameOfCorruptRecord( + dataSchema, + parsedOptions.columnNameOfCorruptRecord + ) + // Don't push any filter which refers to the "virtual" column which cannot present in the input. + // Such filters will be applied later on the upper layer. + val actualFilters = + pushedFilters.filterNot( + _.references.contains(parsedOptions.columnNameOfCorruptRecord) + ) + + val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap + // Hadoop Configurations are case sensitive. + val hadoopConf = + sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap) + val broadcastedConf = sparkSession.sparkContext.broadcast( + new SerializableConfiguration(hadoopConf) + ) + // The partition values are already truncated in `FileScan.partitions`. + // We should use `readPartitionSchema` as the partition schema here. 
+ JsonPartitionReaderFactory( + sparkSession.sessionState.conf, + broadcastedConf, + dataSchema, + readDataSchema, + readPartitionSchema, + parsedOptions, + actualFilters + ) + } + + /** + * Override "partitions" of + * org.apache.spark.sql.execution.datasources.v2.FileScan to disable splitting + * and sort the files by file paths instead of by file sizes. Note: This + * implementation does not support to partition attributes. + */ + override protected def partitions: Seq[FilePartition] = { + val selectedPartitions = fileIndex.listFiles(partitionFilters, dataFilters) + val maxSplitBytes = + FilePartition.maxSplitBytes(sparkSession, selectedPartitions) + + val splitFiles = selectedPartitions.flatMap { partition => + val partitionValues = partition.values + partition.files + .flatMap { file => + val filePath = file.getPath + PartitionedFileUtil.splitFiles( + sparkSession = sparkSession, + file = file, + filePath = filePath, + isSplitable = isSplitable(filePath), + maxSplitBytes = maxSplitBytes, + partitionValues = partitionValues + ) + } + .toArray + .sortBy(_.filePath.toPath) + // starting from 3.4 PartitionedFile.filePath is SparkPath, not String + } + + getFilePartitions(sparkSession, splitFiles) + } + + /** + * Override "getFilePartitions" of + * org.apache.spark.sql.execution.datasources.FilePartition to assign each + * chunk file in GraphAr to a single partition. + */ + private def getFilePartitions( + sparkSession: SparkSession, + partitionedFiles: Seq[PartitionedFile] + ): Seq[FilePartition] = { + val partitions = new ArrayBuffer[FilePartition] + val currentFiles = new ArrayBuffer[PartitionedFile] + + /** Close the current partition and move to the next. */ + def closePartition(): Unit = { + if (currentFiles.nonEmpty) { + // Copy to a new Array. + val newPartition = FilePartition(partitions.size, currentFiles.toArray) + partitions += newPartition + } + currentFiles.clear() + } + // Assign a file to each partition + partitionedFiles.foreach { file => + closePartition() + // Add the given file to the current partition. + currentFiles += file + } + closePartition() + partitions.toSeq + } + + /** Check if two objects are equal. */ + override def equals(obj: Any): Boolean = obj match { + case g: GarScan => + super.equals(g) && dataSchema == g.dataSchema && options == g.options && + equivalentFilters( + pushedFilters, + g.pushedFilters + ) && formatName == g.formatName + case _ => false + } + + /** Get the hash code of the object. */ + override def hashCode(): Int = formatName match { + case "csv" => super.hashCode() + case "json" => super.hashCode() + case "orc" => getClass.hashCode() + case "parquet" => getClass.hashCode() + case _ => + throw new IllegalArgumentException("Invalid format name: " + formatName) + } + + /** Get the description string of the object. */ + override def description(): String = { + super.description() + ", PushedFilters: " + seqToString(pushedFilters) + } + + /** Get the meta data map of the object. */ + override def getMetaData(): Map[String, String] = { + super.getMetaData() ++ Map("PushedFilters" -> seqToString(pushedFilters)) + } + + /** Construct the file scan with filters. 
*/ + def withFilters( + partitionFilters: Seq[Expression], + dataFilters: Seq[Expression] + ): FileScan = + this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters) +} diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala new file mode 100644 index 000000000..706b72ae3 --- /dev/null +++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala @@ -0,0 +1,109 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Derived from Apache Spark 3.3.4 +// https://github.com/apache/spark/blob/18db204/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScanBuilder.scala + +package org.apache.spark.sql.graphar + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connector.read.Scan +import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex +import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder +import org.apache.spark.sql.execution.datasources.v2.orc.OrcScanBuilder +import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScanBuilder +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +import scala.collection.JavaConverters._ + +/** GarScanBuilder is a class to build the file scan for GarDataSource. */ +case class GarScanBuilder( + sparkSession: SparkSession, + fileIndex: PartitioningAwareFileIndex, + schema: StructType, + dataSchema: StructType, + options: CaseInsensitiveStringMap, + formatName: String +) extends FileScanBuilder(sparkSession, fileIndex, dataSchema) { + lazy val hadoopConf = { + val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap + // Hadoop Configurations are case sensitive. 
+ sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap) + } + + private var filters: Array[Filter] = Array.empty + + override def pushDataFilters(dataFilters: Array[Filter]): Array[Filter] = { + this.filters = dataFilters + formatName match { + case "csv" => Array.empty[Filter] + case "json" => Array.empty[Filter] + case "orc" => pushedOrcFilters + case "parquet" => pushedParquetFilters + case _ => + throw new IllegalArgumentException("Invalid format name: " + formatName) + } + } + + private lazy val pushedParquetFilters: Array[Filter] = { + if (!sparkSession.sessionState.conf.parquetFilterPushDown) { + Array.empty[Filter] + } else { + val builder = + ParquetScanBuilder(sparkSession, fileIndex, schema, dataSchema, options) + builder.pushDataFilters(this.filters) + } + } + + private lazy val pushedOrcFilters: Array[Filter] = { + if (!sparkSession.sessionState.conf.orcFilterPushDown) { + Array.empty[Filter] + } else { + val builder = + OrcScanBuilder(sparkSession, fileIndex, schema, dataSchema, options) + builder.pushDataFilters(this.filters) + } + } + + // Check if the file format supports nested schema pruning. + override protected val supportsNestedSchemaPruning: Boolean = + formatName match { + case "csv" => false + case "json" => false + case "orc" => sparkSession.sessionState.conf.nestedSchemaPruningEnabled + case "parquet" => + sparkSession.sessionState.conf.nestedSchemaPruningEnabled + case _ => + throw new IllegalArgumentException("Invalid format name: " + formatName) + } + + /** Build the file scan for GarDataSource. */ + override def build(): Scan = { + GarScan( + sparkSession, + hadoopConf, + fileIndex, + dataSchema, + readDataSchema(), + readPartitionSchema(), + pushedDataFilters, + options, + formatName + ) + } +} diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala new file mode 100644 index 000000000..df874ea32 --- /dev/null +++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala @@ -0,0 +1,150 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Derived from Apache Spark 3.1.1 +// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala + +package org.apache.spark.sql.graphar + +import org.apache.hadoop.fs.FileStatus +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.csv.CSVOptions +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} +import org.apache.spark.sql.execution.datasources.FileFormat +import org.apache.spark.sql.execution.datasources.csv.CSVDataSource +import org.apache.spark.sql.execution.datasources.orc.OrcUtils +import org.apache.spark.sql.execution.datasources.parquet.ParquetUtils +import org.apache.spark.sql.execution.datasources.v2.FileTable +import org.apache.spark.sql.graphar.csv.CSVWriteBuilder +import org.apache.spark.sql.graphar.orc.OrcWriteBuilder +import org.apache.spark.sql.graphar.parquet.ParquetWriteBuilder +import org.apache.spark.sql.graphar.json.JSONWriteBuilder +import org.apache.spark.sql.types._ +import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.sql.execution.datasources.json.JsonDataSource +import org.apache.spark.sql.catalyst.json.JSONOptions + +import scala.collection.JavaConverters._ + +/** GarTable is a class to represent the graph data in GraphAr as a table. */ +case class GarTable( + name: String, + sparkSession: SparkSession, + options: CaseInsensitiveStringMap, + paths: Seq[String], + userSpecifiedSchema: Option[StructType], + fallbackFileFormat: Class[_ <: FileFormat] +) extends FileTable(sparkSession, options, paths, userSpecifiedSchema) { + + /** Construct a new scan builder. */ + override def newScanBuilder( + options: CaseInsensitiveStringMap + ): GarScanBuilder = + new GarScanBuilder( + sparkSession, + fileIndex, + schema, + dataSchema, + options, + formatName + ) + + /** + * Infer the schema of the table through the methods of the actual file + * format. + */ + override def inferSchema(files: Seq[FileStatus]): Option[StructType] = + formatName match { + case "csv" => { + val parsedOptions = new CSVOptions( + options.asScala.toMap, + columnPruning = sparkSession.sessionState.conf.csvColumnPruning, + sparkSession.sessionState.conf.sessionLocalTimeZone + ) + + CSVDataSource(parsedOptions).inferSchema( + sparkSession, + files, + parsedOptions + ) + } + case "orc" => + OrcUtils.inferSchema(sparkSession, files, options.asScala.toMap) + case "parquet" => + ParquetUtils.inferSchema(sparkSession, options.asScala.toMap, files) + case "json" => { + val parsedOptions = new JSONOptions( + options.asScala.toMap, + sparkSession.sessionState.conf.sessionLocalTimeZone + ) + + JsonDataSource(parsedOptions).inferSchema( + sparkSession, + files, + parsedOptions + ) + } + case _ => + throw new IllegalArgumentException("Invalid format name: " + formatName) + + } + + /** Construct a new write builder according to the actual file format. */ + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = + formatName match { + case "csv" => + new CSVWriteBuilder(paths, formatName, supportsDataType, info) + case "orc" => + new OrcWriteBuilder(paths, formatName, supportsDataType, info) + case "parquet" => + new ParquetWriteBuilder(paths, formatName, supportsDataType, info) + case "json" => + new JSONWriteBuilder(paths, formatName, supportsDataType, info) + case _ => + throw new IllegalArgumentException("Invalid format name: " + formatName) + } + + /** + * Check if a data type is supported. 
Note: Currently, the GraphAr data source + * only supports several atomic data types. To support additional data types + * such as Struct, Array and Map, revise this function to handle them case by + * case as the commented code shows. + */ + override def supportsDataType(dataType: DataType): Boolean = dataType match { + // case _: AnsiIntervalType => false + + case _: AtomicType => true + + // case st: StructType => st.forall { f => supportsDataType(f.dataType) } + + case ArrayType(elementType, _) => + formatName match { + case "orc" => supportsDataType(elementType) + case "parquet" => supportsDataType(elementType) + case _ => false + } + + // case MapType(keyType, valueType, _) => + // supportsDataType(keyType) && supportsDataType(valueType) + + // case udt: UserDefinedType[_] => supportsDataType(udt.sqlType) + + case _ => false + } + + /** The actual file format for storing the data in GraphAr. */ + override def formatName: String = options.get("fileFormat") +} diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarWriteBuilder.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarWriteBuilder.scala new file mode 100644 index 000000000..58f1890da --- /dev/null +++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/GarWriteBuilder.scala @@ -0,0 +1,176 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Derived from Apache Spark 3.3.4 +// https://github.com/apache/spark/blob/18db204/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriteBuilder.scala + +package org.apache.spark.sql.graphar + +import java.util.UUID + +import scala.collection.JavaConverters._ + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapreduce.Job +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat + +import org.apache.spark.sql.execution.datasources.OutputWriterFactory +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} +import org.apache.spark.sql.connector.write.{ + BatchWrite, + LogicalWriteInfo, + WriteBuilder +} +import org.apache.spark.sql.execution.datasources.{ + BasicWriteJobStatsTracker, + DataSource, + WriteJobDescription +} +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{DataType, StructType} +import org.apache.spark.util.SerializableConfiguration +import org.apache.spark.sql.execution.datasources.v2.FileBatchWrite +import org.apache.spark.sql.catalyst.expressions.AttributeReference + +abstract class GarWriteBuilder( + paths: Seq[String], + formatName: String, + supportsDataType: DataType => Boolean, + info: LogicalWriteInfo +) extends WriteBuilder { + private val schema = info.schema() + private val queryId = info.queryId() + private val options = info.options() + + override def buildForBatch(): BatchWrite = { + val sparkSession = SparkSession.active + validateInputs(sparkSession.sessionState.conf.caseSensitiveAnalysis) + val path = new Path(paths.head) + val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap + // Hadoop Configurations are case sensitive. + val hadoopConf = + sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap) + val job = getJobInstance(hadoopConf, path) + val committer = new GarCommitProtocol( + java.util.UUID.randomUUID().toString, + paths.head, + options.asScala.toMap, + false + ) + lazy val description = + createWriteJobDescription( + sparkSession, + hadoopConf, + job, + paths.head, + options.asScala.toMap + ) + + committer.setupJob(job) + new FileBatchWrite(job, description, committer) + } + + def prepareWrite( + sqlConf: SQLConf, + job: Job, + options: Map[String, String], + dataSchema: StructType + ): OutputWriterFactory + + private def validateInputs(caseSensitiveAnalysis: Boolean): Unit = { + assert(schema != null, "Missing input data schema") + assert(queryId != null, "Missing query ID") + + if (paths.length != 1) { + throw new IllegalArgumentException( + "Expected exactly one path to be specified, but " + + s"got: ${paths.mkString(", ")}" + ) + } + val pathName = paths.head + val sqlConf = SparkSession.active.sessionState.conf + DataSource.validateSchema(schema, sqlConf) + + schema.foreach { field => + if (!supportsDataType(field.dataType)) { + throw new IllegalArgumentException( + s"$formatName data source does not support ${field.dataType.catalogString} data type." 
+ ) + } + } + } + + private def getJobInstance(hadoopConf: Configuration, path: Path): Job = { + val job = Job.getInstance(hadoopConf) + job.setOutputKeyClass(classOf[Void]) + job.setOutputValueClass(classOf[InternalRow]) + FileOutputFormat.setOutputPath(job, path) + job + } + + private def createWriteJobDescription( + sparkSession: SparkSession, + hadoopConf: Configuration, + job: Job, + pathName: String, + options: Map[String, String] + ): WriteJobDescription = { + val caseInsensitiveOptions = CaseInsensitiveMap(options) + // Note: prepareWrite has side effect. It sets "job". + val outputWriterFactory = + prepareWrite( + sparkSession.sessionState.conf, + job, + caseInsensitiveOptions, + schema + ) + // same as schema.toAttributes which is private of spark package + val allColumns: Seq[AttributeReference] = schema.map(f => + AttributeReference(f.name, f.dataType, f.nullable, f.metadata)() + ) + val metrics: Map[String, SQLMetric] = BasicWriteJobStatsTracker.metrics + val serializableHadoopConf = new SerializableConfiguration(hadoopConf) + val statsTracker = + new BasicWriteJobStatsTracker(serializableHadoopConf, metrics) + // TODO: after partitioning is supported in V2: + // 1. filter out partition columns in `dataColumns`. + // 2. Don't use Seq.empty for `partitionColumns`. + new WriteJobDescription( + uuid = UUID.randomUUID().toString, + serializableHadoopConf = + new SerializableConfiguration(job.getConfiguration), + outputWriterFactory = outputWriterFactory, + allColumns = allColumns, + dataColumns = allColumns, + partitionColumns = Seq.empty, + bucketSpec = None, + path = pathName, + customPartitionLocations = Map.empty, + maxRecordsPerFile = caseInsensitiveOptions + .get("maxRecordsPerFile") + .map(_.toLong) + .getOrElse(sparkSession.sessionState.conf.maxRecordsPerFile), + timeZoneId = caseInsensitiveOptions + .get(DateTimeUtils.TIMEZONE_OPTION) + .getOrElse(sparkSession.sessionState.conf.sessionLocalTimeZone), + statsTrackers = Seq(statsTracker) + ) + } +} diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/csv/CSVWriteBuilder.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/csv/CSVWriteBuilder.scala new file mode 100644 index 000000000..68e156e07 --- /dev/null +++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/csv/CSVWriteBuilder.scala @@ -0,0 +1,72 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Derived from Apache Spark 3.1.1 +// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVWriteBuilder.scala + +package org.apache.spark.sql.graphar.csv + +import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} +import org.apache.spark.sql.catalyst.csv.CSVOptions +import org.apache.spark.sql.catalyst.util.CompressionCodecs +import org.apache.spark.sql.connector.write.LogicalWriteInfo +import org.apache.spark.sql.execution.datasources.{ + CodecStreams, + OutputWriter, + OutputWriterFactory +} +import org.apache.spark.sql.execution.datasources.csv.CsvOutputWriter +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{DataType, StructType} +import org.apache.spark.sql.graphar.GarWriteBuilder + +class CSVWriteBuilder( + paths: Seq[String], + formatName: String, + supportsDataType: DataType => Boolean, + info: LogicalWriteInfo +) extends GarWriteBuilder(paths, formatName, supportsDataType, info) { + override def prepareWrite( + sqlConf: SQLConf, + job: Job, + options: Map[String, String], + dataSchema: StructType + ): OutputWriterFactory = { + val conf = job.getConfiguration + val csvOptions = new CSVOptions( + options, + columnPruning = sqlConf.csvColumnPruning, + sqlConf.sessionLocalTimeZone + ) + csvOptions.compressionCodec.foreach { codec => + CompressionCodecs.setCodecConfiguration(conf, codec) + } + + new OutputWriterFactory { + override def newInstance( + path: String, + dataSchema: StructType, + context: TaskAttemptContext + ): OutputWriter = { + new CsvOutputWriter(path, dataSchema, context, csvOptions) + } + + override def getFileExtension(context: TaskAttemptContext): String = { + ".csv" + CodecStreams.getCompressionExtension(context) + } + } + } +} diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/json/JSONWriteBuilder.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/json/JSONWriteBuilder.scala new file mode 100644 index 000000000..150a9a9f8 --- /dev/null +++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/json/JSONWriteBuilder.scala @@ -0,0 +1,73 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Derived from Apache Spark 3.5.1 +// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonWriteBuilder.scala + +package org.apache.spark.sql.graphar.json +import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} +import org.apache.spark.sql.catalyst.util.CompressionCodecs +import org.apache.spark.sql.connector.write.LogicalWriteInfo +import org.apache.spark.sql.execution.datasources.json.JsonOutputWriter +import org.apache.spark.sql.execution.datasources.{ + CodecStreams, + OutputWriter, + OutputWriterFactory +} + +import org.apache.spark.sql.catalyst.json.JSONOptions +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{StructType, DataType} + +import org.apache.spark.sql.graphar.GarWriteBuilder + +class JSONWriteBuilder( + paths: Seq[String], + formatName: String, + supportsDataType: DataType => Boolean, + info: LogicalWriteInfo +) extends GarWriteBuilder(paths, formatName, supportsDataType, info) { + override def prepareWrite( + sqlConf: SQLConf, + job: Job, + options: Map[String, String], + dataSchema: StructType + ): OutputWriterFactory = { + val conf = job.getConfiguration + val parsedOptions = new JSONOptions( + options, + sqlConf.sessionLocalTimeZone, + sqlConf.columnNameOfCorruptRecord + ) + parsedOptions.compressionCodec.foreach { codec => + CompressionCodecs.setCodecConfiguration(conf, codec) + } + + new OutputWriterFactory { + override def newInstance( + path: String, + dataSchema: StructType, + context: TaskAttemptContext + ): OutputWriter = { + new JsonOutputWriter(path, parsedOptions, dataSchema, context) + } + + override def getFileExtension(context: TaskAttemptContext): String = { + ".json" + CodecStreams.getCompressionExtension(context) + } + } + } +} diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/orc/OrcOutputWriter.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/orc/OrcOutputWriter.scala new file mode 100644 index 000000000..ccc7a48e1 --- /dev/null +++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/orc/OrcOutputWriter.scala @@ -0,0 +1,70 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Derived from Apache Spark 3.1.1, since the OrcOutputWriter is private in the original source, +// we have to reimplement it here. 
+// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcOutputWriter.scala + +package org.apache.spark.sql.graphar.orc + +import org.apache.hadoop.fs.Path +import org.apache.hadoop.io.NullWritable +import org.apache.hadoop.mapreduce.TaskAttemptContext +import org.apache.orc.OrcFile +import org.apache.orc.mapred.{ + OrcOutputFormat => OrcMapRedOutputFormat, + OrcStruct +} +import org.apache.orc.mapreduce.{OrcMapreduceRecordWriter, OrcOutputFormat} + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.datasources.OutputWriter +import org.apache.spark.sql.execution.datasources.orc.{OrcSerializer, OrcUtils} +import org.apache.spark.sql.types._ + +class OrcOutputWriter( + val path: String, + dataSchema: StructType, + context: TaskAttemptContext +) extends OutputWriter { + + private[this] val serializer = new OrcSerializer(dataSchema) + + private val recordWriter = { + val orcOutputFormat = new OrcOutputFormat[OrcStruct]() { + override def getDefaultWorkFile( + context: TaskAttemptContext, + extension: String + ): Path = { + new Path(path) + } + } + val filename = orcOutputFormat.getDefaultWorkFile(context, ".orc") + val options = OrcMapRedOutputFormat.buildOptions(context.getConfiguration) + val writer = OrcFile.createWriter(filename, options) + val recordWriter = new OrcMapreduceRecordWriter[OrcStruct](writer) + OrcUtils.addSparkVersionMetadata(writer) + recordWriter + } + + override def write(row: InternalRow): Unit = { + recordWriter.write(NullWritable.get(), serializer.serialize(row)) + } + + override def close(): Unit = { + recordWriter.close(context) + } +} diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/orc/OrcWriteBuilder.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/orc/OrcWriteBuilder.scala new file mode 100644 index 000000000..287162f8e --- /dev/null +++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/orc/OrcWriteBuilder.scala @@ -0,0 +1,104 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Derived from Apache Spark 3.1.1 +// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/ORCWriteBuilder.scala + +package org.apache.spark.sql.graphar.orc + +import org.apache.hadoop.mapred.JobConf +import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} +import org.apache.orc.OrcConf.{COMPRESS, MAPRED_OUTPUT_SCHEMA} +import org.apache.orc.mapred.OrcStruct + +import org.apache.spark.sql.connector.write.LogicalWriteInfo +import org.apache.spark.sql.execution.datasources.{ + OutputWriter, + OutputWriterFactory +} +import org.apache.spark.sql.execution.datasources.orc.{OrcOptions, OrcUtils} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ + +import org.apache.spark.sql.graphar.GarWriteBuilder + +object OrcWriteBuilder { + // the getQuotedSchemaString method of spark OrcFileFormat + private def getQuotedSchemaString(dataType: DataType): String = + dataType match { + case StructType(fields) => + fields + .map(f => s"`${f.name}`:${getQuotedSchemaString(f.dataType)}") + .mkString("struct<", ",", ">") + case ArrayType(elementType, _) => + s"array<${getQuotedSchemaString(elementType)}>" + case MapType(keyType, valueType, _) => + s"map<${getQuotedSchemaString(keyType)},${getQuotedSchemaString(valueType)}>" + case _ => // UDT and others + dataType.catalogString + } +} + +class OrcWriteBuilder( + paths: Seq[String], + formatName: String, + supportsDataType: DataType => Boolean, + info: LogicalWriteInfo +) extends GarWriteBuilder(paths, formatName, supportsDataType, info) { + + override def prepareWrite( + sqlConf: SQLConf, + job: Job, + options: Map[String, String], + dataSchema: StructType + ): OutputWriterFactory = { + val orcOptions = new OrcOptions(options, sqlConf) + + val conf = job.getConfiguration + + conf.set( + MAPRED_OUTPUT_SCHEMA.getAttribute, + OrcWriteBuilder.getQuotedSchemaString(dataSchema) + ) + + conf.set(COMPRESS.getAttribute, orcOptions.compressionCodec) + + conf + .asInstanceOf[JobConf] + .setOutputFormat( + classOf[org.apache.orc.mapred.OrcOutputFormat[OrcStruct]] + ) + + new OutputWriterFactory { + override def newInstance( + path: String, + dataSchema: StructType, + context: TaskAttemptContext + ): OutputWriter = { + new OrcOutputWriter(path, dataSchema, context) + } + + override def getFileExtension(context: TaskAttemptContext): String = { + val compressionExtension: String = { + val name = context.getConfiguration.get(COMPRESS.getAttribute) + OrcUtils.extensionsForCompressionCodecNames.getOrElse(name, "") + } + + compressionExtension + ".orc" + } + } + } +} diff --git a/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/parquet/ParquetWriteBuilder.scala b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/parquet/ParquetWriteBuilder.scala new file mode 100644 index 000000000..8e53dc5f8 --- /dev/null +++ b/maven-projects/spark/datasources-34/src/main/scala/org/apache/spark/sql/graphar/parquet/ParquetWriteBuilder.scala @@ -0,0 +1,152 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Derived from Apache Spark 3.1.1 +// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetWriteBuilder.scala + +package org.apache.spark.sql.graphar.parquet + +import org.apache.hadoop.mapreduce.{Job, OutputCommitter, TaskAttemptContext} +import org.apache.parquet.hadoop.{ParquetOutputCommitter, ParquetOutputFormat} +import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel +import org.apache.parquet.hadoop.codec.CodecConfig +import org.apache.parquet.hadoop.util.ContextUtil + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.connector.write.LogicalWriteInfo +import org.apache.spark.sql.execution.datasources.{ + OutputWriter, + OutputWriterFactory +} +import org.apache.spark.sql.execution.datasources.parquet._ +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ + +import org.apache.spark.sql.graphar.GarWriteBuilder + +class ParquetWriteBuilder( + paths: Seq[String], + formatName: String, + supportsDataType: DataType => Boolean, + info: LogicalWriteInfo +) extends GarWriteBuilder(paths, formatName, supportsDataType, info) + with Logging { + + override def prepareWrite( + sqlConf: SQLConf, + job: Job, + options: Map[String, String], + dataSchema: StructType + ): OutputWriterFactory = { + val parquetOptions = new ParquetOptions(options, sqlConf) + + val conf = ContextUtil.getConfiguration(job) + + val committerClass = + conf.getClass( + SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key, + classOf[ParquetOutputCommitter], + classOf[OutputCommitter] + ) + + if (conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) == null) { + logInfo( + "Using default output committer for Parquet: " + + classOf[ParquetOutputCommitter].getCanonicalName + ) + } else { + logInfo( + "Using user defined output committer for Parquet: " + committerClass.getCanonicalName + ) + } + + conf.setClass( + SQLConf.OUTPUT_COMMITTER_CLASS.key, + committerClass, + classOf[OutputCommitter] + ) + + // We're not really using `ParquetOutputFormat[Row]` for writing data here, because we override + // it in `ParquetOutputWriter` to support appending and dynamic partitioning. The reason why + // we set it here is to setup the output committer class to `ParquetOutputCommitter`, which is + // bundled with `ParquetOutputFormat[Row]`. + job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]]) + + ParquetOutputFormat.setWriteSupportClass(job, classOf[ParquetWriteSupport]) + + // This metadata is useful for keeping UDTs like Vector/Matrix. + ParquetWriteSupport.setSchema(dataSchema, conf) + + // Sets flags for `ParquetWriteSupport`, which converts Catalyst schema to Parquet + // schema and writes actual rows to Parquet files. 
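+    // The settings below propagate the legacy Parquet format flag, the output timestamp type,
+    // the compression codec from the write options, and (since Spark 3.3) the field-id write
+    // setting into the job's Hadoop configuration.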
+ conf.set( + SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key, + sqlConf.writeLegacyParquetFormat.toString + ) + + conf.set( + SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key, + sqlConf.parquetOutputTimestampType.toString + ) + + // Sets compression scheme + conf.set( + ParquetOutputFormat.COMPRESSION, + parquetOptions.compressionCodecClassName + ) + + // ParquetOutputWriter required fields starting from 3.3.x + conf.set( + SQLConf.PARQUET_FIELD_ID_WRITE_ENABLED.key, + sqlConf.parquetFieldIdWriteEnabled.toString + ) + + // SPARK-15719: Disables writing Parquet summary files by default. + if ( + conf.get(ParquetOutputFormat.JOB_SUMMARY_LEVEL) == null + && conf.get(ParquetOutputFormat.ENABLE_JOB_SUMMARY) == null + ) { + conf.setEnum(ParquetOutputFormat.JOB_SUMMARY_LEVEL, JobSummaryLevel.NONE) + } + + if ( + ParquetOutputFormat.getJobSummaryLevel(conf) == JobSummaryLevel.NONE + && !classOf[ParquetOutputCommitter].isAssignableFrom(committerClass) + ) { + // output summary is requested, but the class is not a Parquet Committer + logWarning( + s"Committer $committerClass is not a ParquetOutputCommitter and cannot" + + s" create job summaries. " + + s"Set Parquet option ${ParquetOutputFormat.JOB_SUMMARY_LEVEL} to NONE." + ) + } + + new OutputWriterFactory { + override def newInstance( + path: String, + dataSchema: StructType, + context: TaskAttemptContext + ): OutputWriter = { + new ParquetOutputWriter(path, context) + } + + override def getFileExtension(context: TaskAttemptContext): String = { + CodecConfig.from(context).getCodec.getExtension + ".parquet" + } + } + } +} diff --git a/maven-projects/spark/datasources-35/.scalafmt.conf b/maven-projects/spark/datasources-35/.scalafmt.conf new file mode 120000 index 000000000..4cb05e831 --- /dev/null +++ b/maven-projects/spark/datasources-35/.scalafmt.conf @@ -0,0 +1 @@ +../.scalafmt.conf \ No newline at end of file diff --git a/maven-projects/spark/datasources-35/pom.xml b/maven-projects/spark/datasources-35/pom.xml new file mode 100644 index 000000000..db65159af --- /dev/null +++ b/maven-projects/spark/datasources-35/pom.xml @@ -0,0 +1,198 @@ + + + + + 4.0.0 + + + org.apache.graphar + spark + ${graphar.version} + ../pom.xml + + + graphar-datasources + ${graphar.version} + jar + + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-mllib_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-hive_${scala.binary.version} + ${spark.version} + provided + + + com.fasterxml.jackson.core + jackson-databind + ${jackson.version} + + + + + + + org.scala-tools + maven-scala-plugin + 2.15.2 + + ${scala.version} + + -target:jvm-${maven.compiler.target} + + + -Xss4096K + + + + + scala-compile + + compile + + + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + scala-test-compile + + testCompile + + + + + + net.alchim31.maven + scala-maven-plugin + 4.8.0 + + + + compile + testCompile + + + + + + -Xms64m + -Xmx1024m + + + -Ywarn-unused + + + + org.scalameta + semanticdb-scalac_${scala.version} + ${semanticdb-scalac.version} + + + + + + com.diffplug.spotless + spotless-maven-plugin + 2.20.0 + + + + + + + 1.13.0 + + + + + + ${project.basedir}/.scalafmt.conf + + + + + + io.github.evis + scalafix-maven-plugin_2.13 + 0.1.8_0.11.0 + + + org.apache.maven.plugins + maven-source-plugin + + + 
attach-sources + + jar + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + attach-javadocs + + jar + + + + + + maven-site-plugin + 3.7.1 + + + + diff --git a/maven-projects/spark/datasources-35/src/main/java/org/apache/graphar/GeneralParams.java b/maven-projects/spark/datasources-35/src/main/java/org/apache/graphar/GeneralParams.java new file mode 120000 index 000000000..a3915d619 --- /dev/null +++ b/maven-projects/spark/datasources-35/src/main/java/org/apache/graphar/GeneralParams.java @@ -0,0 +1 @@ +../../../../../../../graphar/src/main/java/org/apache/graphar/GeneralParams.java \ No newline at end of file diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/graphar/datasources/GarDataSource.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/graphar/datasources/GarDataSource.scala new file mode 100644 index 000000000..e502f82c6 --- /dev/null +++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/graphar/datasources/GarDataSource.scala @@ -0,0 +1,180 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.graphar.datasources + +import scala.collection.JavaConverters._ +import scala.util.matching.Regex +import java.util +import com.fasterxml.jackson.databind.ObjectMapper +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.connector.catalog.{Table, TableProvider} +import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat +import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat +import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.execution.datasources.json.JsonFileFormat +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.sql.sources.DataSourceRegister +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.graphar.GarTable + +// Derived from Apache Spark 3.1.1 +// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala + +/** + * GarDataSource is a class to provide gar files as the data source for spark. + */ +class GarDataSource extends TableProvider with DataSourceRegister { + private val REDACTION_REPLACEMENT_TEXT = "*********(redacted)" + + /** + * Redact the sensitive information in the given string. 
+ */ + // Copy of redact from graphar Utils + private def redact(regex: Option[Regex], text: String): String = { + regex match { + case None => text + case Some(r) => + if (text == null || text.isEmpty) { + text + } else { + r.replaceAllIn(text, REDACTION_REPLACEMENT_TEXT) + } + } + } + + /** The default fallback file format is Parquet. */ + def fallbackFileFormat: Class[_ <: FileFormat] = classOf[ParquetFileFormat] + + lazy val sparkSession = SparkSession.active + + /** The string that represents the format name. */ + override def shortName(): String = "gar" + + protected def getPaths(map: CaseInsensitiveStringMap): Seq[String] = { + val objectMapper = new ObjectMapper() + val paths = Option(map.get("paths")) + .map { pathStr => + objectMapper.readValue(pathStr, classOf[Array[String]]).toSeq + } + .getOrElse(Seq.empty) + paths ++ Option(map.get("path")).toSeq + } + + protected def getOptionsWithoutPaths( + map: CaseInsensitiveStringMap + ): CaseInsensitiveStringMap = { + val withoutPath = map.asCaseSensitiveMap().asScala.filterKeys { k => + !k.equalsIgnoreCase("path") && !k.equalsIgnoreCase("paths") + } + new CaseInsensitiveStringMap(withoutPath.toMap.asJava) + } + + protected def getTableName( + map: CaseInsensitiveStringMap, + paths: Seq[String] + ): String = { + val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions( + map.asCaseSensitiveMap().asScala.toMap + ) + val name = shortName() + " " + paths + .map(qualifiedPathName(_, hadoopConf)) + .mkString(",") + redact(sparkSession.sessionState.conf.stringRedactionPattern, name) + } + + private def qualifiedPathName( + path: String, + hadoopConf: Configuration + ): String = { + val hdfsPath = new Path(path) + val fs = hdfsPath.getFileSystem(hadoopConf) + hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory).toString + } + + /** Provide a table from the data source. */ + def getTable(options: CaseInsensitiveStringMap): Table = { + val paths = getPaths(options) + val tableName = getTableName(options, paths) + val optionsWithoutPaths = getOptionsWithoutPaths(options) + GarTable( + tableName, + sparkSession, + optionsWithoutPaths, + paths, + None, + getFallbackFileFormat(options) + ) + } + + /** Provide a table from the data source with specific schema. */ + def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = { + val paths = getPaths(options) + val tableName = getTableName(options, paths) + val optionsWithoutPaths = getOptionsWithoutPaths(options) + GarTable( + tableName, + sparkSession, + optionsWithoutPaths, + paths, + Some(schema), + getFallbackFileFormat(options) + ) + } + + override def supportsExternalMetadata(): Boolean = true + + private var t: Table = null + + override def inferSchema(options: CaseInsensitiveStringMap): StructType = { + if (t == null) t = getTable(options) + t.schema() + } + + override def inferPartitioning( + options: CaseInsensitiveStringMap + ): Array[Transform] = { + Array.empty + } + + override def getTable( + schema: StructType, + partitioning: Array[Transform], + properties: util.Map[String, String] + ): Table = { + // If the table is already loaded during schema inference, return it directly. + if (t != null) { + t + } else { + getTable(new CaseInsensitiveStringMap(properties), schema) + } + } + + // Get the actual fall back file format. 
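+  // Only "csv", "orc", "parquet" and "json" are accepted; any other value of the
+  // "fileFormat" option results in an IllegalArgumentException.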
+ private def getFallbackFileFormat( + options: CaseInsensitiveStringMap + ): Class[_ <: FileFormat] = options.get("fileFormat") match { + case "csv" => classOf[CSVFileFormat] + case "orc" => classOf[OrcFileFormat] + case "parquet" => classOf[ParquetFileFormat] + case "json" => classOf[JsonFileFormat] + case _ => throw new IllegalArgumentException + } +} diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarCommitProtocol.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarCommitProtocol.scala new file mode 100644 index 000000000..c6ca79c21 --- /dev/null +++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarCommitProtocol.scala @@ -0,0 +1,97 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Derived from Apache Spark 3.3.4 +// https://github.com/apache/spark/blob/18db204/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala + +package org.apache.spark.sql.graphar + +import org.apache.graphar.GeneralParams + +import org.json4s._ +import org.json4s.jackson.JsonMethods._ + +import org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol +import org.apache.hadoop.mapreduce._ +import org.apache.spark.internal.Logging +import org.apache.spark.internal.io.FileNameSpec + +object GarCommitProtocol { + private def binarySearchPair(aggNums: Array[Int], key: Int): (Int, Int) = { + var low = 0 + var high = aggNums.length - 1 + var mid = 0 + while (low <= high) { + mid = (high + low) / 2; + if ( + aggNums(mid) <= key && (mid == aggNums.length - 1 || aggNums( + mid + 1 + ) > key) + ) { + return (mid, key - aggNums(mid)) + } else if (aggNums(mid) > key) { + high = mid - 1 + } else { + low = mid + 1 + } + } + return (low, key - aggNums(low)) + } +} + +class GarCommitProtocol( + jobId: String, + path: String, + options: Map[String, String], + dynamicPartitionOverwrite: Boolean = false +) extends SQLHadoopMapReduceCommitProtocol( + jobId, + path, + dynamicPartitionOverwrite + ) + with Serializable + with Logging { + + // override getFilename to customize the file name + override def getFilename( + taskContext: TaskAttemptContext, + spec: FileNameSpec + ): String = { + val partitionId = taskContext.getTaskAttemptID.getTaskID.getId + if (options.contains(GeneralParams.offsetStartChunkIndexKey)) { + // offset chunk file name, looks like chunk0 + val chunk_index = + options(GeneralParams.offsetStartChunkIndexKey).toInt + partitionId + return f"chunk$chunk_index" + } + if (options.contains(GeneralParams.aggNumListOfEdgeChunkKey)) { + // edge chunk file name, looks like part0/chunk0 + val jValue = parse( + options(GeneralParams.aggNumListOfEdgeChunkKey) + ) + implicit val formats = + DefaultFormats 
// initialize a default formats for json4s + val aggNums: Array[Int] = Extraction.extract[Array[Int]](jValue) + val chunkPair: (Int, Int) = + GarCommitProtocol.binarySearchPair(aggNums, partitionId) + val vertex_chunk_index: Int = chunkPair._1 + val edge_chunk_index: Int = chunkPair._2 + return f"part$vertex_chunk_index/chunk$edge_chunk_index" + } + // vertex chunk file name, looks like chunk0 + return f"chunk$partitionId" + } +} diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala new file mode 100644 index 000000000..11fd6a1dc --- /dev/null +++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarScan.scala @@ -0,0 +1,361 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Derived from Apache Spark 3.3.4 +// https://github.com/apache/spark/blob/18db204/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala + +package org.apache.spark.sql.graphar + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.parquet.hadoop.ParquetInputFormat +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.csv.CSVOptions +import org.apache.spark.sql.catalyst.json.JSONOptionsInRead +import org.apache.spark.sql.catalyst.expressions.{ExprUtils, Expression} +import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.connector.read.PartitionReaderFactory +import org.apache.spark.sql.execution.PartitionedFileUtil +import org.apache.spark.sql.execution.datasources.{ + FilePartition, + PartitionedFile, + PartitioningAwareFileIndex +} +import org.apache.spark.sql.execution.datasources.parquet.{ + ParquetOptions, + ParquetReadSupport, + ParquetWriteSupport +} +import org.apache.spark.sql.execution.datasources.v2.FileScan +import org.apache.spark.sql.execution.datasources.v2.csv.CSVPartitionReaderFactory +import org.apache.spark.sql.execution.datasources.v2.json.JsonPartitionReaderFactory +import org.apache.spark.sql.execution.datasources.v2.orc.OrcPartitionReaderFactory +import org.apache.spark.sql.execution.datasources.orc.OrcOptions +import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetPartitionReaderFactory +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.util.SerializableConfiguration + +import scala.collection.mutable.ArrayBuffer +import scala.jdk.CollectionConverters._ +import org.apache.spark.memory.MemoryMode + +/** GarScan is a class to implement the file scan for 
GarDataSource. */ +case class GarScan( + sparkSession: SparkSession, + hadoopConf: Configuration, + fileIndex: PartitioningAwareFileIndex, + dataSchema: StructType, + readDataSchema: StructType, + readPartitionSchema: StructType, + pushedFilters: Array[Filter], + options: CaseInsensitiveStringMap, + formatName: String, + partitionFilters: Seq[Expression] = Seq.empty, + dataFilters: Seq[Expression] = Seq.empty +) extends FileScan { + + /** The gar format is not splitable. */ + override def isSplitable(path: Path): Boolean = false + + /** Create the reader factory according to the actual file format. */ + override def createReaderFactory(): PartitionReaderFactory = + formatName match { + case "csv" => createCSVReaderFactory() + case "orc" => createOrcReaderFactory() + case "parquet" => createParquetReaderFactory() + case "json" => createJSONReaderFactory() + case _ => + throw new IllegalArgumentException("Invalid format name: " + formatName) + } + + // Create the reader factory for the CSV format. + private def createCSVReaderFactory(): PartitionReaderFactory = { + val columnPruning = sparkSession.sessionState.conf.csvColumnPruning && + !readDataSchema.exists( + _.name == sparkSession.sessionState.conf.columnNameOfCorruptRecord + ) + + val parsedOptions: CSVOptions = new CSVOptions( + options.asScala.toMap, + columnPruning = columnPruning, + sparkSession.sessionState.conf.sessionLocalTimeZone, + sparkSession.sessionState.conf.columnNameOfCorruptRecord + ) + + // Check a field requirement for corrupt records here to throw an exception in a driver side + ExprUtils.verifyColumnNameOfCorruptRecord( + dataSchema, + parsedOptions.columnNameOfCorruptRecord + ) + // Don't push any filter which refers to the "virtual" column which cannot present in the input. + // Such filters will be applied later on the upper layer. + val actualFilters = + pushedFilters.filterNot( + _.references.contains(parsedOptions.columnNameOfCorruptRecord) + ) + + val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap + // Hadoop Configurations are case sensitive. + val hadoopConf = + sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap) + val broadcastedConf = sparkSession.sparkContext.broadcast( + new SerializableConfiguration(hadoopConf) + ) + // The partition values are already truncated in `FileScan.partitions`. + // We should use `readPartitionSchema` as the partition schema here. + CSVPartitionReaderFactory( + sparkSession.sessionState.conf, + broadcastedConf, + dataSchema, + readDataSchema, + readPartitionSchema, + parsedOptions, + actualFilters + ) + } + + // Create the reader factory for the Orc format. + private def createOrcReaderFactory(): PartitionReaderFactory = { + val broadcastedConf = sparkSession.sparkContext.broadcast( + new SerializableConfiguration(hadoopConf) + ) + // The partition values are already truncated in `FileScan.partitions`. + // We should use `readPartitionSchema` as the partition schema here. + OrcPartitionReaderFactory( + sqlConf = sparkSession.sessionState.conf, + broadcastedConf = broadcastedConf, + dataSchema = dataSchema, + readDataSchema = readDataSchema, + partitionSchema = readPartitionSchema, + filters = pushedFilters, + aggregation = None, + options = new OrcOptions( + Map.empty[String, String], + sparkSession.sessionState.conf + ), + memoryMode = MemoryMode.ON_HEAP + ) + } + + // Create the reader factory for the Parquet format. 
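+  // The requested read schema, session time zone and Parquet-related SQL flags are copied
+  // into the Hadoop configuration, which is then broadcast so executors can build the readers.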
+ private def createParquetReaderFactory(): PartitionReaderFactory = { + val readDataSchemaAsJson = readDataSchema.json + hadoopConf.set( + ParquetInputFormat.READ_SUPPORT_CLASS, + classOf[ParquetReadSupport].getName + ) + hadoopConf.set( + ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, + readDataSchemaAsJson + ) + hadoopConf.set(ParquetWriteSupport.SPARK_ROW_SCHEMA, readDataSchemaAsJson) + hadoopConf.set( + SQLConf.SESSION_LOCAL_TIMEZONE.key, + sparkSession.sessionState.conf.sessionLocalTimeZone + ) + hadoopConf.setBoolean( + SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key, + sparkSession.sessionState.conf.nestedSchemaPruningEnabled + ) + hadoopConf.setBoolean( + SQLConf.CASE_SENSITIVE.key, + sparkSession.sessionState.conf.caseSensitiveAnalysis + ) + + ParquetWriteSupport.setSchema(readDataSchema, hadoopConf) + + // Sets flags for `ParquetToSparkSchemaConverter` + hadoopConf.setBoolean( + SQLConf.PARQUET_BINARY_AS_STRING.key, + sparkSession.sessionState.conf.isParquetBinaryAsString + ) + hadoopConf.setBoolean( + SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, + sparkSession.sessionState.conf.isParquetINT96AsTimestamp + ) + hadoopConf.setBoolean( + SQLConf.LEGACY_PARQUET_NANOS_AS_LONG.key, + sparkSession.sessionState.conf.legacyParquetNanosAsLong + ) + hadoopConf.setBoolean( + SQLConf.PARQUET_FIELD_ID_READ_ENABLED.key, + sparkSession.sessionState.conf.parquetFieldIdReadEnabled + ) + hadoopConf.setBoolean( + SQLConf.PARQUET_INFER_TIMESTAMP_NTZ_ENABLED.key, + sparkSession.sessionState.conf.parquetInferTimestampNTZEnabled + ) + + val broadcastedConf = sparkSession.sparkContext.broadcast( + new SerializableConfiguration(hadoopConf) + ) + val sqlConf = sparkSession.sessionState.conf + ParquetPartitionReaderFactory( + sqlConf = sqlConf, + broadcastedConf = broadcastedConf, + dataSchema = dataSchema, + readDataSchema = readDataSchema, + partitionSchema = readPartitionSchema, + filters = pushedFilters, + aggregation = None, + new ParquetOptions(options.asCaseSensitiveMap.asScala.toMap, sqlConf) + ) + } + + // Create the reader factory for the JSON format. + private def createJSONReaderFactory(): PartitionReaderFactory = { + val parsedOptions = new JSONOptionsInRead( + CaseInsensitiveMap(options.asScala.toMap), + sparkSession.sessionState.conf.sessionLocalTimeZone, + sparkSession.sessionState.conf.columnNameOfCorruptRecord + ) + + // Check a field requirement for corrupt records here to throw an exception in a driver side + ExprUtils.verifyColumnNameOfCorruptRecord( + dataSchema, + parsedOptions.columnNameOfCorruptRecord + ) + // Don't push any filter which refers to the "virtual" column which cannot present in the input. + // Such filters will be applied later on the upper layer. + val actualFilters = + pushedFilters.filterNot( + _.references.contains(parsedOptions.columnNameOfCorruptRecord) + ) + + val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap + // Hadoop Configurations are case sensitive. + val hadoopConf = + sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap) + val broadcastedConf = sparkSession.sparkContext.broadcast( + new SerializableConfiguration(hadoopConf) + ) + // The partition values are already truncated in `FileScan.partitions`. + // We should use `readPartitionSchema` as the partition schema here. 
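+    // Build the JSON partition reader factory with the remaining filters, i.e. those that
+    // do not reference the corrupt-record column.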
+ JsonPartitionReaderFactory( + sparkSession.sessionState.conf, + broadcastedConf, + dataSchema, + readDataSchema, + readPartitionSchema, + parsedOptions, + actualFilters + ) + } + + /** + * Override "partitions" of + * org.apache.spark.sql.execution.datasources.v2.FileScan to disable splitting + * and sort the files by file paths instead of by file sizes. Note: This + * implementation does not support to partition attributes. + */ + override protected def partitions: Seq[FilePartition] = { + val selectedPartitions = fileIndex.listFiles(partitionFilters, dataFilters) + val maxSplitBytes = + FilePartition.maxSplitBytes(sparkSession, selectedPartitions) + + val splitFiles = selectedPartitions.flatMap { partition => + val partitionValues = partition.values + partition.files + .flatMap { file => + val filePath = file.getPath + PartitionedFileUtil.splitFiles( + sparkSession = sparkSession, + file = file, + isSplitable = isSplitable(filePath), + maxSplitBytes = maxSplitBytes, + partitionValues = partitionValues + ) + } + .toArray + .sortBy(_.filePath.toPath) + // starting from 3.4 PartitionedFile.filePath is SparkPath, not String + } + + getFilePartitions(sparkSession, splitFiles) + } + + /** + * Override "getFilePartitions" of + * org.apache.spark.sql.execution.datasources.FilePartition to assign each + * chunk file in GraphAr to a single partition. + */ + private def getFilePartitions( + sparkSession: SparkSession, + partitionedFiles: Seq[PartitionedFile] + ): Seq[FilePartition] = { + val partitions = new ArrayBuffer[FilePartition] + val currentFiles = new ArrayBuffer[PartitionedFile] + + /** Close the current partition and move to the next. */ + def closePartition(): Unit = { + if (currentFiles.nonEmpty) { + // Copy to a new Array. + val newPartition = FilePartition(partitions.size, currentFiles.toArray) + partitions += newPartition + } + currentFiles.clear() + } + // Assign a file to each partition + partitionedFiles.foreach { file => + closePartition() + // Add the given file to the current partition. + currentFiles += file + } + closePartition() + partitions.toSeq + } + + /** Check if two objects are equal. */ + override def equals(obj: Any): Boolean = obj match { + case g: GarScan => + super.equals(g) && dataSchema == g.dataSchema && options == g.options && + equivalentFilters( + pushedFilters, + g.pushedFilters + ) && formatName == g.formatName + case _ => false + } + + /** Get the hash code of the object. */ + override def hashCode(): Int = formatName match { + case "csv" => super.hashCode() + case "json" => super.hashCode() + case "orc" => getClass.hashCode() + case "parquet" => getClass.hashCode() + case _ => + throw new IllegalArgumentException("Invalid format name: " + formatName) + } + + /** Get the description string of the object. */ + override def description(): String = { + super.description() + ", PushedFilters: " + seqToString(pushedFilters) + } + + /** Get the meta data map of the object. */ + override def getMetaData(): Map[String, String] = { + super.getMetaData() ++ Map("PushedFilters" -> seqToString(pushedFilters)) + } + + /** Construct the file scan with filters. 
*/ + def withFilters( + partitionFilters: Seq[Expression], + dataFilters: Seq[Expression] + ): FileScan = + this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters) +} diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala new file mode 100644 index 000000000..706b72ae3 --- /dev/null +++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarScanBuilder.scala @@ -0,0 +1,109 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Derived from Apache Spark 3.3.4 +// https://github.com/apache/spark/blob/18db204/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScanBuilder.scala + +package org.apache.spark.sql.graphar + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connector.read.Scan +import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex +import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder +import org.apache.spark.sql.execution.datasources.v2.orc.OrcScanBuilder +import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScanBuilder +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +import scala.collection.JavaConverters._ + +/** GarScanBuilder is a class to build the file scan for GarDataSource. */ +case class GarScanBuilder( + sparkSession: SparkSession, + fileIndex: PartitioningAwareFileIndex, + schema: StructType, + dataSchema: StructType, + options: CaseInsensitiveStringMap, + formatName: String +) extends FileScanBuilder(sparkSession, fileIndex, dataSchema) { + lazy val hadoopConf = { + val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap + // Hadoop Configurations are case sensitive. 
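+    // Merge the case-preserved data source options into a fresh Hadoop configuration for this scan.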
+ sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap) + } + + private var filters: Array[Filter] = Array.empty + + override def pushDataFilters(dataFilters: Array[Filter]): Array[Filter] = { + this.filters = dataFilters + formatName match { + case "csv" => Array.empty[Filter] + case "json" => Array.empty[Filter] + case "orc" => pushedOrcFilters + case "parquet" => pushedParquetFilters + case _ => + throw new IllegalArgumentException("Invalid format name: " + formatName) + } + } + + private lazy val pushedParquetFilters: Array[Filter] = { + if (!sparkSession.sessionState.conf.parquetFilterPushDown) { + Array.empty[Filter] + } else { + val builder = + ParquetScanBuilder(sparkSession, fileIndex, schema, dataSchema, options) + builder.pushDataFilters(this.filters) + } + } + + private lazy val pushedOrcFilters: Array[Filter] = { + if (!sparkSession.sessionState.conf.orcFilterPushDown) { + Array.empty[Filter] + } else { + val builder = + OrcScanBuilder(sparkSession, fileIndex, schema, dataSchema, options) + builder.pushDataFilters(this.filters) + } + } + + // Check if the file format supports nested schema pruning. + override protected val supportsNestedSchemaPruning: Boolean = + formatName match { + case "csv" => false + case "json" => false + case "orc" => sparkSession.sessionState.conf.nestedSchemaPruningEnabled + case "parquet" => + sparkSession.sessionState.conf.nestedSchemaPruningEnabled + case _ => + throw new IllegalArgumentException("Invalid format name: " + formatName) + } + + /** Build the file scan for GarDataSource. */ + override def build(): Scan = { + GarScan( + sparkSession, + hadoopConf, + fileIndex, + dataSchema, + readDataSchema(), + readPartitionSchema(), + pushedDataFilters, + options, + formatName + ) + } +} diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala new file mode 100644 index 000000000..df874ea32 --- /dev/null +++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarTable.scala @@ -0,0 +1,150 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Derived from Apache Spark 3.1.1 +// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala + +package org.apache.spark.sql.graphar + +import org.apache.hadoop.fs.FileStatus +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.csv.CSVOptions +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} +import org.apache.spark.sql.execution.datasources.FileFormat +import org.apache.spark.sql.execution.datasources.csv.CSVDataSource +import org.apache.spark.sql.execution.datasources.orc.OrcUtils +import org.apache.spark.sql.execution.datasources.parquet.ParquetUtils +import org.apache.spark.sql.execution.datasources.v2.FileTable +import org.apache.spark.sql.graphar.csv.CSVWriteBuilder +import org.apache.spark.sql.graphar.orc.OrcWriteBuilder +import org.apache.spark.sql.graphar.parquet.ParquetWriteBuilder +import org.apache.spark.sql.graphar.json.JSONWriteBuilder +import org.apache.spark.sql.types._ +import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.sql.execution.datasources.json.JsonDataSource +import org.apache.spark.sql.catalyst.json.JSONOptions + +import scala.collection.JavaConverters._ + +/** GarTable is a class to represent the graph data in GraphAr as a table. */ +case class GarTable( + name: String, + sparkSession: SparkSession, + options: CaseInsensitiveStringMap, + paths: Seq[String], + userSpecifiedSchema: Option[StructType], + fallbackFileFormat: Class[_ <: FileFormat] +) extends FileTable(sparkSession, options, paths, userSpecifiedSchema) { + + /** Construct a new scan builder. */ + override def newScanBuilder( + options: CaseInsensitiveStringMap + ): GarScanBuilder = + new GarScanBuilder( + sparkSession, + fileIndex, + schema, + dataSchema, + options, + formatName + ) + + /** + * Infer the schema of the table through the methods of the actual file + * format. + */ + override def inferSchema(files: Seq[FileStatus]): Option[StructType] = + formatName match { + case "csv" => { + val parsedOptions = new CSVOptions( + options.asScala.toMap, + columnPruning = sparkSession.sessionState.conf.csvColumnPruning, + sparkSession.sessionState.conf.sessionLocalTimeZone + ) + + CSVDataSource(parsedOptions).inferSchema( + sparkSession, + files, + parsedOptions + ) + } + case "orc" => + OrcUtils.inferSchema(sparkSession, files, options.asScala.toMap) + case "parquet" => + ParquetUtils.inferSchema(sparkSession, options.asScala.toMap, files) + case "json" => { + val parsedOptions = new JSONOptions( + options.asScala.toMap, + sparkSession.sessionState.conf.sessionLocalTimeZone + ) + + JsonDataSource(parsedOptions).inferSchema( + sparkSession, + files, + parsedOptions + ) + } + case _ => + throw new IllegalArgumentException("Invalid format name: " + formatName) + + } + + /** Construct a new write builder according to the actual file format. */ + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = + formatName match { + case "csv" => + new CSVWriteBuilder(paths, formatName, supportsDataType, info) + case "orc" => + new OrcWriteBuilder(paths, formatName, supportsDataType, info) + case "parquet" => + new ParquetWriteBuilder(paths, formatName, supportsDataType, info) + case "json" => + new JSONWriteBuilder(paths, formatName, supportsDataType, info) + case _ => + throw new IllegalArgumentException("Invalid format name: " + formatName) + } + + /** + * Check if a data type is supported. 
Note: Currently, the GraphAr data source + * only supports several atomic data types. To support additional data types + * such as Struct, Array and Map, revise this function to handle them case by + * case as the commented code shows. + */ + override def supportsDataType(dataType: DataType): Boolean = dataType match { + // case _: AnsiIntervalType => false + + case _: AtomicType => true + + // case st: StructType => st.forall { f => supportsDataType(f.dataType) } + + case ArrayType(elementType, _) => + formatName match { + case "orc" => supportsDataType(elementType) + case "parquet" => supportsDataType(elementType) + case _ => false + } + + // case MapType(keyType, valueType, _) => + // supportsDataType(keyType) && supportsDataType(valueType) + + // case udt: UserDefinedType[_] => supportsDataType(udt.sqlType) + + case _ => false + } + + /** The actual file format for storing the data in GraphAr. */ + override def formatName: String = options.get("fileFormat") +} diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarWriteBuilder.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarWriteBuilder.scala new file mode 100644 index 000000000..58f1890da --- /dev/null +++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/GarWriteBuilder.scala @@ -0,0 +1,176 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Derived from Apache Spark 3.3.4 +// https://github.com/apache/spark/blob/18db204/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriteBuilder.scala + +package org.apache.spark.sql.graphar + +import java.util.UUID + +import scala.collection.JavaConverters._ + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapreduce.Job +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat + +import org.apache.spark.sql.execution.datasources.OutputWriterFactory +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} +import org.apache.spark.sql.connector.write.{ + BatchWrite, + LogicalWriteInfo, + WriteBuilder +} +import org.apache.spark.sql.execution.datasources.{ + BasicWriteJobStatsTracker, + DataSource, + WriteJobDescription +} +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{DataType, StructType} +import org.apache.spark.util.SerializableConfiguration +import org.apache.spark.sql.execution.datasources.v2.FileBatchWrite +import org.apache.spark.sql.catalyst.expressions.AttributeReference + +abstract class GarWriteBuilder( + paths: Seq[String], + formatName: String, + supportsDataType: DataType => Boolean, + info: LogicalWriteInfo +) extends WriteBuilder { + private val schema = info.schema() + private val queryId = info.queryId() + private val options = info.options() + + override def buildForBatch(): BatchWrite = { + val sparkSession = SparkSession.active + validateInputs(sparkSession.sessionState.conf.caseSensitiveAnalysis) + val path = new Path(paths.head) + val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap + // Hadoop Configurations are case sensitive. + val hadoopConf = + sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap) + val job = getJobInstance(hadoopConf, path) + val committer = new GarCommitProtocol( + java.util.UUID.randomUUID().toString, + paths.head, + options.asScala.toMap, + false + ) + lazy val description = + createWriteJobDescription( + sparkSession, + hadoopConf, + job, + paths.head, + options.asScala.toMap + ) + + committer.setupJob(job) + new FileBatchWrite(job, description, committer) + } + + def prepareWrite( + sqlConf: SQLConf, + job: Job, + options: Map[String, String], + dataSchema: StructType + ): OutputWriterFactory + + private def validateInputs(caseSensitiveAnalysis: Boolean): Unit = { + assert(schema != null, "Missing input data schema") + assert(queryId != null, "Missing query ID") + + if (paths.length != 1) { + throw new IllegalArgumentException( + "Expected exactly one path to be specified, but " + + s"got: ${paths.mkString(", ")}" + ) + } + val pathName = paths.head + val sqlConf = SparkSession.active.sessionState.conf + DataSource.validateSchema(schema, sqlConf) + + schema.foreach { field => + if (!supportsDataType(field.dataType)) { + throw new IllegalArgumentException( + s"$formatName data source does not support ${field.dataType.catalogString} data type." 
+ ) + } + } + } + + private def getJobInstance(hadoopConf: Configuration, path: Path): Job = { + val job = Job.getInstance(hadoopConf) + job.setOutputKeyClass(classOf[Void]) + job.setOutputValueClass(classOf[InternalRow]) + FileOutputFormat.setOutputPath(job, path) + job + } + + private def createWriteJobDescription( + sparkSession: SparkSession, + hadoopConf: Configuration, + job: Job, + pathName: String, + options: Map[String, String] + ): WriteJobDescription = { + val caseInsensitiveOptions = CaseInsensitiveMap(options) + // Note: prepareWrite has side effect. It sets "job". + val outputWriterFactory = + prepareWrite( + sparkSession.sessionState.conf, + job, + caseInsensitiveOptions, + schema + ) + // same as schema.toAttributes which is private of spark package + val allColumns: Seq[AttributeReference] = schema.map(f => + AttributeReference(f.name, f.dataType, f.nullable, f.metadata)() + ) + val metrics: Map[String, SQLMetric] = BasicWriteJobStatsTracker.metrics + val serializableHadoopConf = new SerializableConfiguration(hadoopConf) + val statsTracker = + new BasicWriteJobStatsTracker(serializableHadoopConf, metrics) + // TODO: after partitioning is supported in V2: + // 1. filter out partition columns in `dataColumns`. + // 2. Don't use Seq.empty for `partitionColumns`. + new WriteJobDescription( + uuid = UUID.randomUUID().toString, + serializableHadoopConf = + new SerializableConfiguration(job.getConfiguration), + outputWriterFactory = outputWriterFactory, + allColumns = allColumns, + dataColumns = allColumns, + partitionColumns = Seq.empty, + bucketSpec = None, + path = pathName, + customPartitionLocations = Map.empty, + maxRecordsPerFile = caseInsensitiveOptions + .get("maxRecordsPerFile") + .map(_.toLong) + .getOrElse(sparkSession.sessionState.conf.maxRecordsPerFile), + timeZoneId = caseInsensitiveOptions + .get(DateTimeUtils.TIMEZONE_OPTION) + .getOrElse(sparkSession.sessionState.conf.sessionLocalTimeZone), + statsTrackers = Seq(statsTracker) + ) + } +} diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/csv/CSVWriteBuilder.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/csv/CSVWriteBuilder.scala new file mode 100644 index 000000000..68e156e07 --- /dev/null +++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/csv/CSVWriteBuilder.scala @@ -0,0 +1,72 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Derived from Apache Spark 3.1.1 +// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVWriteBuilder.scala + +package org.apache.spark.sql.graphar.csv + +import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} +import org.apache.spark.sql.catalyst.csv.CSVOptions +import org.apache.spark.sql.catalyst.util.CompressionCodecs +import org.apache.spark.sql.connector.write.LogicalWriteInfo +import org.apache.spark.sql.execution.datasources.{ + CodecStreams, + OutputWriter, + OutputWriterFactory +} +import org.apache.spark.sql.execution.datasources.csv.CsvOutputWriter +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{DataType, StructType} +import org.apache.spark.sql.graphar.GarWriteBuilder + +class CSVWriteBuilder( + paths: Seq[String], + formatName: String, + supportsDataType: DataType => Boolean, + info: LogicalWriteInfo +) extends GarWriteBuilder(paths, formatName, supportsDataType, info) { + override def prepareWrite( + sqlConf: SQLConf, + job: Job, + options: Map[String, String], + dataSchema: StructType + ): OutputWriterFactory = { + val conf = job.getConfiguration + val csvOptions = new CSVOptions( + options, + columnPruning = sqlConf.csvColumnPruning, + sqlConf.sessionLocalTimeZone + ) + csvOptions.compressionCodec.foreach { codec => + CompressionCodecs.setCodecConfiguration(conf, codec) + } + + new OutputWriterFactory { + override def newInstance( + path: String, + dataSchema: StructType, + context: TaskAttemptContext + ): OutputWriter = { + new CsvOutputWriter(path, dataSchema, context, csvOptions) + } + + override def getFileExtension(context: TaskAttemptContext): String = { + ".csv" + CodecStreams.getCompressionExtension(context) + } + } + } +} diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/json/JSONWriteBuilder.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/json/JSONWriteBuilder.scala new file mode 100644 index 000000000..150a9a9f8 --- /dev/null +++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/json/JSONWriteBuilder.scala @@ -0,0 +1,73 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Derived from Apache Spark 3.5.1 +// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonWriteBuilder.scala + +package org.apache.spark.sql.graphar.json +import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} +import org.apache.spark.sql.catalyst.util.CompressionCodecs +import org.apache.spark.sql.connector.write.LogicalWriteInfo +import org.apache.spark.sql.execution.datasources.json.JsonOutputWriter +import org.apache.spark.sql.execution.datasources.{ + CodecStreams, + OutputWriter, + OutputWriterFactory +} + +import org.apache.spark.sql.catalyst.json.JSONOptions +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{StructType, DataType} + +import org.apache.spark.sql.graphar.GarWriteBuilder + +class JSONWriteBuilder( + paths: Seq[String], + formatName: String, + supportsDataType: DataType => Boolean, + info: LogicalWriteInfo +) extends GarWriteBuilder(paths, formatName, supportsDataType, info) { + override def prepareWrite( + sqlConf: SQLConf, + job: Job, + options: Map[String, String], + dataSchema: StructType + ): OutputWriterFactory = { + val conf = job.getConfiguration + val parsedOptions = new JSONOptions( + options, + sqlConf.sessionLocalTimeZone, + sqlConf.columnNameOfCorruptRecord + ) + parsedOptions.compressionCodec.foreach { codec => + CompressionCodecs.setCodecConfiguration(conf, codec) + } + + new OutputWriterFactory { + override def newInstance( + path: String, + dataSchema: StructType, + context: TaskAttemptContext + ): OutputWriter = { + new JsonOutputWriter(path, parsedOptions, dataSchema, context) + } + + override def getFileExtension(context: TaskAttemptContext): String = { + ".json" + CodecStreams.getCompressionExtension(context) + } + } + } +} diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/orc/OrcOutputWriter.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/orc/OrcOutputWriter.scala new file mode 100644 index 000000000..ccc7a48e1 --- /dev/null +++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/orc/OrcOutputWriter.scala @@ -0,0 +1,70 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Derived from Apache Spark 3.1.1, since the OrcOutputWriter is private in the original source, +// we have to reimplement it here. 
+// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcOutputWriter.scala + +package org.apache.spark.sql.graphar.orc + +import org.apache.hadoop.fs.Path +import org.apache.hadoop.io.NullWritable +import org.apache.hadoop.mapreduce.TaskAttemptContext +import org.apache.orc.OrcFile +import org.apache.orc.mapred.{ + OrcOutputFormat => OrcMapRedOutputFormat, + OrcStruct +} +import org.apache.orc.mapreduce.{OrcMapreduceRecordWriter, OrcOutputFormat} + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.datasources.OutputWriter +import org.apache.spark.sql.execution.datasources.orc.{OrcSerializer, OrcUtils} +import org.apache.spark.sql.types._ + +class OrcOutputWriter( + val path: String, + dataSchema: StructType, + context: TaskAttemptContext +) extends OutputWriter { + + private[this] val serializer = new OrcSerializer(dataSchema) + + private val recordWriter = { + val orcOutputFormat = new OrcOutputFormat[OrcStruct]() { + override def getDefaultWorkFile( + context: TaskAttemptContext, + extension: String + ): Path = { + new Path(path) + } + } + val filename = orcOutputFormat.getDefaultWorkFile(context, ".orc") + val options = OrcMapRedOutputFormat.buildOptions(context.getConfiguration) + val writer = OrcFile.createWriter(filename, options) + val recordWriter = new OrcMapreduceRecordWriter[OrcStruct](writer) + OrcUtils.addSparkVersionMetadata(writer) + recordWriter + } + + override def write(row: InternalRow): Unit = { + recordWriter.write(NullWritable.get(), serializer.serialize(row)) + } + + override def close(): Unit = { + recordWriter.close(context) + } +} diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/orc/OrcWriteBuilder.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/orc/OrcWriteBuilder.scala new file mode 100644 index 000000000..287162f8e --- /dev/null +++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/orc/OrcWriteBuilder.scala @@ -0,0 +1,104 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Derived from Apache Spark 3.1.1 +// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/ORCWriteBuilder.scala + +package org.apache.spark.sql.graphar.orc + +import org.apache.hadoop.mapred.JobConf +import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} +import org.apache.orc.OrcConf.{COMPRESS, MAPRED_OUTPUT_SCHEMA} +import org.apache.orc.mapred.OrcStruct + +import org.apache.spark.sql.connector.write.LogicalWriteInfo +import org.apache.spark.sql.execution.datasources.{ + OutputWriter, + OutputWriterFactory +} +import org.apache.spark.sql.execution.datasources.orc.{OrcOptions, OrcUtils} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ + +import org.apache.spark.sql.graphar.GarWriteBuilder + +object OrcWriteBuilder { + // the getQuotedSchemaString method of spark OrcFileFormat + private def getQuotedSchemaString(dataType: DataType): String = + dataType match { + case StructType(fields) => + fields + .map(f => s"`${f.name}`:${getQuotedSchemaString(f.dataType)}") + .mkString("struct<", ",", ">") + case ArrayType(elementType, _) => + s"array<${getQuotedSchemaString(elementType)}>" + case MapType(keyType, valueType, _) => + s"map<${getQuotedSchemaString(keyType)},${getQuotedSchemaString(valueType)}>" + case _ => // UDT and others + dataType.catalogString + } +} + +class OrcWriteBuilder( + paths: Seq[String], + formatName: String, + supportsDataType: DataType => Boolean, + info: LogicalWriteInfo +) extends GarWriteBuilder(paths, formatName, supportsDataType, info) { + + override def prepareWrite( + sqlConf: SQLConf, + job: Job, + options: Map[String, String], + dataSchema: StructType + ): OutputWriterFactory = { + val orcOptions = new OrcOptions(options, sqlConf) + + val conf = job.getConfiguration + + conf.set( + MAPRED_OUTPUT_SCHEMA.getAttribute, + OrcWriteBuilder.getQuotedSchemaString(dataSchema) + ) + + conf.set(COMPRESS.getAttribute, orcOptions.compressionCodec) + + conf + .asInstanceOf[JobConf] + .setOutputFormat( + classOf[org.apache.orc.mapred.OrcOutputFormat[OrcStruct]] + ) + + new OutputWriterFactory { + override def newInstance( + path: String, + dataSchema: StructType, + context: TaskAttemptContext + ): OutputWriter = { + new OrcOutputWriter(path, dataSchema, context) + } + + override def getFileExtension(context: TaskAttemptContext): String = { + val compressionExtension: String = { + val name = context.getConfiguration.get(COMPRESS.getAttribute) + OrcUtils.extensionsForCompressionCodecNames.getOrElse(name, "") + } + + compressionExtension + ".orc" + } + } + } +} diff --git a/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/parquet/ParquetWriteBuilder.scala b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/parquet/ParquetWriteBuilder.scala new file mode 100644 index 000000000..8e53dc5f8 --- /dev/null +++ b/maven-projects/spark/datasources-35/src/main/scala/org/apache/spark/sql/graphar/parquet/ParquetWriteBuilder.scala @@ -0,0 +1,152 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Derived from Apache Spark 3.1.1 +// https://github.com/apache/spark/blob/1d550c4/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetWriteBuilder.scala + +package org.apache.spark.sql.graphar.parquet + +import org.apache.hadoop.mapreduce.{Job, OutputCommitter, TaskAttemptContext} +import org.apache.parquet.hadoop.{ParquetOutputCommitter, ParquetOutputFormat} +import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel +import org.apache.parquet.hadoop.codec.CodecConfig +import org.apache.parquet.hadoop.util.ContextUtil + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.connector.write.LogicalWriteInfo +import org.apache.spark.sql.execution.datasources.{ + OutputWriter, + OutputWriterFactory +} +import org.apache.spark.sql.execution.datasources.parquet._ +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ + +import org.apache.spark.sql.graphar.GarWriteBuilder + +class ParquetWriteBuilder( + paths: Seq[String], + formatName: String, + supportsDataType: DataType => Boolean, + info: LogicalWriteInfo +) extends GarWriteBuilder(paths, formatName, supportsDataType, info) + with Logging { + + override def prepareWrite( + sqlConf: SQLConf, + job: Job, + options: Map[String, String], + dataSchema: StructType + ): OutputWriterFactory = { + val parquetOptions = new ParquetOptions(options, sqlConf) + + val conf = ContextUtil.getConfiguration(job) + + val committerClass = + conf.getClass( + SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key, + classOf[ParquetOutputCommitter], + classOf[OutputCommitter] + ) + + if (conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) == null) { + logInfo( + "Using default output committer for Parquet: " + + classOf[ParquetOutputCommitter].getCanonicalName + ) + } else { + logInfo( + "Using user defined output committer for Parquet: " + committerClass.getCanonicalName + ) + } + + conf.setClass( + SQLConf.OUTPUT_COMMITTER_CLASS.key, + committerClass, + classOf[OutputCommitter] + ) + + // We're not really using `ParquetOutputFormat[Row]` for writing data here, because we override + // it in `ParquetOutputWriter` to support appending and dynamic partitioning. The reason why + // we set it here is to setup the output committer class to `ParquetOutputCommitter`, which is + // bundled with `ParquetOutputFormat[Row]`. + job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]]) + + ParquetOutputFormat.setWriteSupportClass(job, classOf[ParquetWriteSupport]) + + // This metadata is useful for keeping UDTs like Vector/Matrix. + ParquetWriteSupport.setSchema(dataSchema, conf) + + // Sets flags for `ParquetWriteSupport`, which converts Catalyst schema to Parquet + // schema and writes actual rows to Parquet files. 
+    conf.set( +      SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key, +      sqlConf.writeLegacyParquetFormat.toString +    ) + +    conf.set( +      SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key, +      sqlConf.parquetOutputTimestampType.toString +    ) + +    // Sets compression scheme +    conf.set( +      ParquetOutputFormat.COMPRESSION, +      parquetOptions.compressionCodecClassName +    ) + +    // Required by ParquetOutputWriter starting from Spark 3.3.x +    conf.set( +      SQLConf.PARQUET_FIELD_ID_WRITE_ENABLED.key, +      sqlConf.parquetFieldIdWriteEnabled.toString +    ) + +    // SPARK-15719: Disables writing Parquet summary files by default. +    if ( +      conf.get(ParquetOutputFormat.JOB_SUMMARY_LEVEL) == null +      && conf.get(ParquetOutputFormat.ENABLE_JOB_SUMMARY) == null +    ) { +      conf.setEnum(ParquetOutputFormat.JOB_SUMMARY_LEVEL, JobSummaryLevel.NONE) +    } + +    if ( +      ParquetOutputFormat.getJobSummaryLevel(conf) != JobSummaryLevel.NONE +      && !classOf[ParquetOutputCommitter].isAssignableFrom(committerClass) +    ) { +      // output summary is requested, but the class is not a Parquet Committer +      logWarning( +        s"Committer $committerClass is not a ParquetOutputCommitter and cannot" + +          s" create job summaries. " + +          s"Set Parquet option ${ParquetOutputFormat.JOB_SUMMARY_LEVEL} to NONE." +      ) +    } + +    new OutputWriterFactory { +      override def newInstance( +          path: String, +          dataSchema: StructType, +          context: TaskAttemptContext +      ): OutputWriter = { +        new ParquetOutputWriter(path, context) +      } + +      override def getFileExtension(context: TaskAttemptContext): String = { +        CodecConfig.from(context).getCodec.getExtension + ".parquet" +      } +    } +  } +} diff --git a/maven-projects/spark/graphar/pom.xml b/maven-projects/spark/graphar/pom.xml index 9dc46eb49..045b27fc1 100644 --- a/maven-projects/spark/graphar/pom.xml +++ b/maven-projects/spark/graphar/pom.xml @@ -90,7 +90,7 @@ org.neo4j - neo4j-connector-apache-spark_2.12 + neo4j-connector-apache-spark_${scala.binary.version} 5.0.0_for_spark_3 @@ -100,7 +100,7 @@ org.scala-lang.modules - scala-collection-compat_2.12 + scala-collection-compat_${scala.binary.version} 2.1.1 @@ -113,7 +113,7 @@ ${scala.version} - -target:jvm-1.8 + -target:jvm-${maven.compiler.target} -Xss4096K @@ -218,8 +218,8 @@ org.scalameta - semanticdb-scalac_2.12.10 - 4.3.24 + semanticdb-scalac_${scala.version} + ${semanticdb-scalac.version} diff --git a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/graph/GraphReader.scala b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/graph/GraphReader.scala index 3fbd56aa8..251e1b203 100644 --- a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/graph/GraphReader.scala +++ b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/graph/GraphReader.scala @@ -118,10 +118,13 @@ object GraphReader { def readWithGraphInfo( graphInfo: GraphInfo, spark: SparkSession - ): Pair[Map[String, DataFrame], Map[ - (String, String, String), - Map[String, DataFrame] - ]] = { + ): ( + Map[String, DataFrame], + Map[ + (String, String, String), + Map[String, DataFrame] + ] + ) = { val prefix = graphInfo.getPrefix val vertex_infos = graphInfo.getVertexInfos() val edge_infos = graphInfo.getEdgeInfos() @@ -148,10 +151,13 @@ object GraphReader { def read( graphInfoPath: String, spark: SparkSession - ): Pair[Map[String, DataFrame], Map[ - (String, String, String), - Map[String, DataFrame] - ]] = { + ): ( + Map[String, DataFrame], + Map[ + (String, String, String), + Map[String, DataFrame] + ] + ) = { // load graph info val graph_info = GraphInfo.loadGraphInfo(graphInfoPath, spark) diff --git
a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/graph/GraphWriter.scala b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/graph/GraphWriter.scala index 18e7f649f..14098da56 100644 --- a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/graph/GraphWriter.scala +++ b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/graph/GraphWriter.scala @@ -281,11 +281,11 @@ class GraphWriter() { } val vertices: scala.collection.mutable.Map[String, DataFrame] = - scala.collection.mutable.Map[String, DataFrame]() + scala.collection.mutable.Map.empty val edges: scala.collection.mutable.Map[(String, String, String), DataFrame] = - scala.collection.mutable.Map[(String, String, String), DataFrame]() + scala.collection.mutable.Map.empty val vertexNums: scala.collection.mutable.Map[String, Long] = - scala.collection.mutable.Map[String, Long]() + scala.collection.mutable.Map.empty val primaryKeys: scala.collection.mutable.Map[String, String] = - scala.collection.mutable.Map[String, String]() + scala.collection.mutable.Map.empty } diff --git a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/writer/EdgeWriter.scala b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/writer/EdgeWriter.scala index b1e2e6748..d007d18c2 100644 --- a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/writer/EdgeWriter.scala +++ b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/writer/EdgeWriter.scala @@ -18,7 +18,6 @@ */ package org.apache.graphar.writer - import org.apache.graphar.util.{FileSystem, EdgeChunkPartitioner} import org.apache.graphar.{ GeneralParams, @@ -160,9 +159,9 @@ object EdgeWriter { val filterRDD = edgeCountsByPrimaryKey .filter(v => v._1 / vertexChunkSize == i) .map { case (k, v) => (k - i * vertexChunkSize + 1, v) } - val initRDD = spark.sparkContext.parallelize( - (0L to vertexChunkSize).map(key => (key, 0)) - ) + val initRDD = spark.sparkContext + .range(0L, vertexChunkSize + 1) + .map(key => (key, 0)) val unionRDD = spark.sparkContext .union(filterRDD, initRDD) .reduceByKey(_ + _) @@ -353,7 +352,8 @@ class EdgeWriter( val property = pIter.next() propertyList += "`" + property.getName() + "`" } - val propertyGroupDf = edgeDfAndOffsetDf._1.select(propertyList.map(col): _*) + val propertyGroupDf = + edgeDfAndOffsetDf._1.select(propertyList.map(col).toSeq: _*) val outputPrefix = prefix + edgeInfo.getPropertyGroupPathPrefix(propertyGroup, adjListType) FileSystem.writeDataFrame( diff --git a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/writer/VertexWriter.scala b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/writer/VertexWriter.scala index d6e8483fd..dda261146 100644 --- a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/writer/VertexWriter.scala +++ b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/writer/VertexWriter.scala @@ -137,7 +137,7 @@ class VertexWriter( val property = it.next() property_list += "`" + property.getName() + "`" } - val pg_df = chunks.select(property_list.map(col): _*) + val pg_df = chunks.select(property_list.map(col).toSeq: _*) FileSystem.writeDataFrame( pg_df, propertyGroup.getFile_type(), diff --git a/maven-projects/spark/graphar/src/test/scala/org/apache/graphar/TestReader.scala b/maven-projects/spark/graphar/src/test/scala/org/apache/graphar/TestReader.scala index f61710b95..9c52f75ee 100644 --- a/maven-projects/spark/graphar/src/test/scala/org/apache/graphar/TestReader.scala +++ 
b/maven-projects/spark/graphar/src/test/scala/org/apache/graphar/TestReader.scala @@ -20,6 +20,7 @@ package org.apache.graphar import org.apache.graphar.reader.{VertexReader, EdgeReader} +import org.scalatest.matchers.should.Matchers._ class ReaderSuite extends BaseTestSuite { @@ -57,7 +58,9 @@ class ReaderSuite extends BaseTestSuite { .format("org.apache.graphar.datasources.GarDataSource") .load(orc_read_path) // validate reading results - assert(df2.rdd.collect().deep == df1.rdd.collect().deep) + val left = df2.rdd.collect() + val right = df1.rdd.collect() + left should contain theSameElementsAs right df_pd = df1.filter(cond) /** diff --git a/maven-projects/spark/pom.xml b/maven-projects/spark/pom.xml index 455fb1754..72e6c0854 100644 --- a/maven-projects/spark/pom.xml +++ b/maven-projects/spark/pom.xml @@ -35,48 +35,67 @@ pom ${graphar.version} + + 2.12.15 + 2.12 + 512m + 1024m + 1.8 + 1.8 + 4.8.15 + UTF-8 + UTF-8 + 2.15.4 + + datasources-32 graphar - UTF-8 - UTF-8 - 2.12.10 - 2.12 - 512m - 1024m - 3.2.2 - 1.8 - 1.8 + 3.2.4 graphar datasources-32 - - true - datasources-33 graphar - UTF-8 - UTF-8 - 2.12.12 - 2.12 - 512m - 1024m 3.3.4 - 1.8 - 1.8 graphar datasources-33 + + datasources-34 + + graphar + 3.4.3 + + + graphar + datasources-34 + + + + datasources-35 + + graphar + 3.5.1 + + + graphar + datasources-35 + + + true + + @@ -108,7 +127,7 @@ ${scala.version} - -target:jvm-1.8 + -target:jvm-${maven.compiler.target} -Xss4096K diff --git a/pyspark/Makefile b/pyspark/Makefile index daea2b86a..ceac363df 100644 --- a/pyspark/Makefile +++ b/pyspark/Makefile @@ -19,7 +19,7 @@ install_test: export JAVA_HOME=${JAVA_HOME_11_X64} cd ../maven-projects/spark && mvn --no-transfer-progress clean package -DskipTests -Dspotless.check.skip=true && cd ../../pyspark - export PYSPARK_HADOOP_VERSION=3.2 + export PYSPARK_HADOOP_VERSION=3.3 poetry install --with=spark,tests .PHONY: test diff --git a/pyspark/poetry.lock b/pyspark/poetry.lock index 48f533d1b..78abd7916 100644 --- a/pyspark/poetry.lock +++ b/pyspark/poetry.lock @@ -13,63 +13,83 @@ files = [ [[package]] name = "coverage" -version = "7.4.4" +version = "7.6.1" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.8" files = [ - {file = "coverage-7.4.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e0be5efd5127542ef31f165de269f77560d6cdef525fffa446de6f7e9186cfb2"}, - {file = "coverage-7.4.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ccd341521be3d1b3daeb41960ae94a5e87abe2f46f17224ba5d6f2b8398016cf"}, - {file = "coverage-7.4.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09fa497a8ab37784fbb20ab699c246053ac294d13fc7eb40ec007a5043ec91f8"}, - {file = "coverage-7.4.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b1a93009cb80730c9bca5d6d4665494b725b6e8e157c1cb7f2db5b4b122ea562"}, - {file = "coverage-7.4.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:690db6517f09336559dc0b5f55342df62370a48f5469fabf502db2c6d1cffcd2"}, - {file = "coverage-7.4.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:09c3255458533cb76ef55da8cc49ffab9e33f083739c8bd4f58e79fecfe288f7"}, - {file = "coverage-7.4.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:8ce1415194b4a6bd0cdcc3a1dfbf58b63f910dcb7330fe15bdff542c56949f87"}, - {file = "coverage-7.4.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:b91cbc4b195444e7e258ba27ac33769c41b94967919f10037e6355e998af255c"}, - {file = "coverage-7.4.4-cp310-cp310-win32.whl", hash = "sha256:598825b51b81c808cb6f078dcb972f96af96b078faa47af7dfcdf282835baa8d"}, - {file = "coverage-7.4.4-cp310-cp310-win_amd64.whl", hash = "sha256:09ef9199ed6653989ebbcaacc9b62b514bb63ea2f90256e71fea3ed74bd8ff6f"}, - {file = "coverage-7.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0f9f50e7ef2a71e2fae92774c99170eb8304e3fdf9c8c3c7ae9bab3e7229c5cf"}, - {file = "coverage-7.4.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:623512f8ba53c422fcfb2ce68362c97945095b864cda94a92edbaf5994201083"}, - {file = "coverage-7.4.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0513b9508b93da4e1716744ef6ebc507aff016ba115ffe8ecff744d1322a7b63"}, - {file = "coverage-7.4.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40209e141059b9370a2657c9b15607815359ab3ef9918f0196b6fccce8d3230f"}, - {file = "coverage-7.4.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a2b2b78c78293782fd3767d53e6474582f62443d0504b1554370bde86cc8227"}, - {file = "coverage-7.4.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:73bfb9c09951125d06ee473bed216e2c3742f530fc5acc1383883125de76d9cd"}, - {file = "coverage-7.4.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:1f384c3cc76aeedce208643697fb3e8437604b512255de6d18dae3f27655a384"}, - {file = "coverage-7.4.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:54eb8d1bf7cacfbf2a3186019bcf01d11c666bd495ed18717162f7eb1e9dd00b"}, - {file = "coverage-7.4.4-cp311-cp311-win32.whl", hash = "sha256:cac99918c7bba15302a2d81f0312c08054a3359eaa1929c7e4b26ebe41e9b286"}, - {file = "coverage-7.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:b14706df8b2de49869ae03a5ccbc211f4041750cd4a66f698df89d44f4bd30ec"}, - {file = "coverage-7.4.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:201bef2eea65e0e9c56343115ba3814e896afe6d36ffd37bab783261db430f76"}, - {file = "coverage-7.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:41c9c5f3de16b903b610d09650e5e27adbfa7f500302718c9ffd1c12cf9d6818"}, - {file = "coverage-7.4.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d898fe162d26929b5960e4e138651f7427048e72c853607f2b200909794ed978"}, - {file = "coverage-7.4.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3ea79bb50e805cd6ac058dfa3b5c8f6c040cb87fe83de10845857f5535d1db70"}, - {file = "coverage-7.4.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce4b94265ca988c3f8e479e741693d143026632672e3ff924f25fab50518dd51"}, - {file = "coverage-7.4.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:00838a35b882694afda09f85e469c96367daa3f3f2b097d846a7216993d37f4c"}, - {file = "coverage-7.4.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:fdfafb32984684eb03c2d83e1e51f64f0906b11e64482df3c5db936ce3839d48"}, - {file = "coverage-7.4.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:69eb372f7e2ece89f14751fbcbe470295d73ed41ecd37ca36ed2eb47512a6ab9"}, - {file = "coverage-7.4.4-cp312-cp312-win32.whl", hash = "sha256:137eb07173141545e07403cca94ab625cc1cc6bc4c1e97b6e3846270e7e1fea0"}, - {file = "coverage-7.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:d71eec7d83298f1af3326ce0ff1d0ea83c7cb98f72b577097f9083b20bdaf05e"}, - {file = 
"coverage-7.4.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d5ae728ff3b5401cc320d792866987e7e7e880e6ebd24433b70a33b643bb0384"}, - {file = "coverage-7.4.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cc4f1358cb0c78edef3ed237ef2c86056206bb8d9140e73b6b89fbcfcbdd40e1"}, - {file = "coverage-7.4.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8130a2aa2acb8788e0b56938786c33c7c98562697bf9f4c7d6e8e5e3a0501e4a"}, - {file = "coverage-7.4.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cf271892d13e43bc2b51e6908ec9a6a5094a4df1d8af0bfc360088ee6c684409"}, - {file = "coverage-7.4.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4cdc86d54b5da0df6d3d3a2f0b710949286094c3a6700c21e9015932b81447e"}, - {file = "coverage-7.4.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ae71e7ddb7a413dd60052e90528f2f65270aad4b509563af6d03d53e979feafd"}, - {file = "coverage-7.4.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:38dd60d7bf242c4ed5b38e094baf6401faa114fc09e9e6632374388a404f98e7"}, - {file = "coverage-7.4.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:aa5b1c1bfc28384f1f53b69a023d789f72b2e0ab1b3787aae16992a7ca21056c"}, - {file = "coverage-7.4.4-cp38-cp38-win32.whl", hash = "sha256:dfa8fe35a0bb90382837b238fff375de15f0dcdb9ae68ff85f7a63649c98527e"}, - {file = "coverage-7.4.4-cp38-cp38-win_amd64.whl", hash = "sha256:b2991665420a803495e0b90a79233c1433d6ed77ef282e8e152a324bbbc5e0c8"}, - {file = "coverage-7.4.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3b799445b9f7ee8bf299cfaed6f5b226c0037b74886a4e11515e569b36fe310d"}, - {file = "coverage-7.4.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b4d33f418f46362995f1e9d4f3a35a1b6322cb959c31d88ae56b0298e1c22357"}, - {file = "coverage-7.4.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aadacf9a2f407a4688d700e4ebab33a7e2e408f2ca04dbf4aef17585389eff3e"}, - {file = "coverage-7.4.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c95949560050d04d46b919301826525597f07b33beba6187d04fa64d47ac82e"}, - {file = "coverage-7.4.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff7687ca3d7028d8a5f0ebae95a6e4827c5616b31a4ee1192bdfde697db110d4"}, - {file = "coverage-7.4.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5fc1de20b2d4a061b3df27ab9b7c7111e9a710f10dc2b84d33a4ab25065994ec"}, - {file = "coverage-7.4.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:c74880fc64d4958159fbd537a091d2a585448a8f8508bf248d72112723974cbd"}, - {file = "coverage-7.4.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:742a76a12aa45b44d236815d282b03cfb1de3b4323f3e4ec933acfae08e54ade"}, - {file = "coverage-7.4.4-cp39-cp39-win32.whl", hash = "sha256:d89d7b2974cae412400e88f35d86af72208e1ede1a541954af5d944a8ba46c57"}, - {file = "coverage-7.4.4-cp39-cp39-win_amd64.whl", hash = "sha256:9ca28a302acb19b6af89e90f33ee3e1906961f94b54ea37de6737b7ca9d8827c"}, - {file = "coverage-7.4.4-pp38.pp39.pp310-none-any.whl", hash = "sha256:b2c5edc4ac10a7ef6605a966c58929ec6c1bd0917fb8c15cb3363f65aa40e677"}, - {file = "coverage-7.4.4.tar.gz", hash = "sha256:c901df83d097649e257e803be22592aedfd5182f07b3cc87d640bbb9afd50f49"}, + {file = "coverage-7.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b06079abebbc0e89e6163b8e8f0e16270124c154dc6e4a47b413dd538859af16"}, + {file = 
"coverage-7.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cf4b19715bccd7ee27b6b120e7e9dd56037b9c0681dcc1adc9ba9db3d417fa36"}, + {file = "coverage-7.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61c0abb4c85b095a784ef23fdd4aede7a2628478e7baba7c5e3deba61070a02"}, + {file = "coverage-7.6.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd21f6ae3f08b41004dfb433fa895d858f3f5979e7762d052b12aef444e29afc"}, + {file = "coverage-7.6.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f59d57baca39b32db42b83b2a7ba6f47ad9c394ec2076b084c3f029b7afca23"}, + {file = "coverage-7.6.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a1ac0ae2b8bd743b88ed0502544847c3053d7171a3cff9228af618a068ed9c34"}, + {file = "coverage-7.6.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e6a08c0be454c3b3beb105c0596ebdc2371fab6bb90c0c0297f4e58fd7e1012c"}, + {file = "coverage-7.6.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f5796e664fe802da4f57a168c85359a8fbf3eab5e55cd4e4569fbacecc903959"}, + {file = "coverage-7.6.1-cp310-cp310-win32.whl", hash = "sha256:7bb65125fcbef8d989fa1dd0e8a060999497629ca5b0efbca209588a73356232"}, + {file = "coverage-7.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:3115a95daa9bdba70aea750db7b96b37259a81a709223c8448fa97727d546fe0"}, + {file = "coverage-7.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7dea0889685db8550f839fa202744652e87c60015029ce3f60e006f8c4462c93"}, + {file = "coverage-7.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed37bd3c3b063412f7620464a9ac1314d33100329f39799255fb8d3027da50d3"}, + {file = "coverage-7.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d85f5e9a5f8b73e2350097c3756ef7e785f55bd71205defa0bfdaf96c31616ff"}, + {file = "coverage-7.6.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bc572be474cafb617672c43fe989d6e48d3c83af02ce8de73fff1c6bb3c198d"}, + {file = "coverage-7.6.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c0420b573964c760df9e9e86d1a9a622d0d27f417e1a949a8a66dd7bcee7bc6"}, + {file = "coverage-7.6.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1f4aa8219db826ce6be7099d559f8ec311549bfc4046f7f9fe9b5cea5c581c56"}, + {file = "coverage-7.6.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:fc5a77d0c516700ebad189b587de289a20a78324bc54baee03dd486f0855d234"}, + {file = "coverage-7.6.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b48f312cca9621272ae49008c7f613337c53fadca647d6384cc129d2996d1133"}, + {file = "coverage-7.6.1-cp311-cp311-win32.whl", hash = "sha256:1125ca0e5fd475cbbba3bb67ae20bd2c23a98fac4e32412883f9bcbaa81c314c"}, + {file = "coverage-7.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:8ae539519c4c040c5ffd0632784e21b2f03fc1340752af711f33e5be83a9d6c6"}, + {file = "coverage-7.6.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:95cae0efeb032af8458fc27d191f85d1717b1d4e49f7cb226cf526ff28179778"}, + {file = "coverage-7.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5621a9175cf9d0b0c84c2ef2b12e9f5f5071357c4d2ea6ca1cf01814f45d2391"}, + {file = "coverage-7.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:260933720fdcd75340e7dbe9060655aff3af1f0c5d20f46b57f262ab6c86a5e8"}, + {file = 
"coverage-7.6.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07e2ca0ad381b91350c0ed49d52699b625aab2b44b65e1b4e02fa9df0e92ad2d"}, + {file = "coverage-7.6.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44fee9975f04b33331cb8eb272827111efc8930cfd582e0320613263ca849ca"}, + {file = "coverage-7.6.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:877abb17e6339d96bf08e7a622d05095e72b71f8afd8a9fefc82cf30ed944163"}, + {file = "coverage-7.6.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3e0cadcf6733c09154b461f1ca72d5416635e5e4ec4e536192180d34ec160f8a"}, + {file = "coverage-7.6.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c3c02d12f837d9683e5ab2f3d9844dc57655b92c74e286c262e0fc54213c216d"}, + {file = "coverage-7.6.1-cp312-cp312-win32.whl", hash = "sha256:e05882b70b87a18d937ca6768ff33cc3f72847cbc4de4491c8e73880766718e5"}, + {file = "coverage-7.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:b5d7b556859dd85f3a541db6a4e0167b86e7273e1cdc973e5b175166bb634fdb"}, + {file = "coverage-7.6.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a4acd025ecc06185ba2b801f2de85546e0b8ac787cf9d3b06e7e2a69f925b106"}, + {file = "coverage-7.6.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a6d3adcf24b624a7b778533480e32434a39ad8fa30c315208f6d3e5542aeb6e9"}, + {file = "coverage-7.6.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0c212c49b6c10e6951362f7c6df3329f04c2b1c28499563d4035d964ab8e08c"}, + {file = "coverage-7.6.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e81d7a3e58882450ec4186ca59a3f20a5d4440f25b1cff6f0902ad890e6748a"}, + {file = "coverage-7.6.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78b260de9790fd81e69401c2dc8b17da47c8038176a79092a89cb2b7d945d060"}, + {file = "coverage-7.6.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a78d169acd38300060b28d600344a803628c3fd585c912cacc9ea8790fe96862"}, + {file = "coverage-7.6.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2c09f4ce52cb99dd7505cd0fc8e0e37c77b87f46bc9c1eb03fe3bc9991085388"}, + {file = "coverage-7.6.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6878ef48d4227aace338d88c48738a4258213cd7b74fd9a3d4d7582bb1d8a155"}, + {file = "coverage-7.6.1-cp313-cp313-win32.whl", hash = "sha256:44df346d5215a8c0e360307d46ffaabe0f5d3502c8a1cefd700b34baf31d411a"}, + {file = "coverage-7.6.1-cp313-cp313-win_amd64.whl", hash = "sha256:8284cf8c0dd272a247bc154eb6c95548722dce90d098c17a883ed36e67cdb129"}, + {file = "coverage-7.6.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d3296782ca4eab572a1a4eca686d8bfb00226300dcefdf43faa25b5242ab8a3e"}, + {file = "coverage-7.6.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:502753043567491d3ff6d08629270127e0c31d4184c4c8d98f92c26f65019962"}, + {file = "coverage-7.6.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a89ecca80709d4076b95f89f308544ec8f7b4727e8a547913a35f16717856cb"}, + {file = "coverage-7.6.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a318d68e92e80af8b00fa99609796fdbcdfef3629c77c6283566c6f02c6d6704"}, + {file = "coverage-7.6.1-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:13b0a73a0896988f053e4fbb7de6d93388e6dd292b0d87ee51d106f2c11b465b"}, + {file = "coverage-7.6.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4421712dbfc5562150f7554f13dde997a2e932a6b5f352edcce948a815efee6f"}, + {file = "coverage-7.6.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:166811d20dfea725e2e4baa71fffd6c968a958577848d2131f39b60043400223"}, + {file = "coverage-7.6.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:225667980479a17db1048cb2bf8bfb39b8e5be8f164b8f6628b64f78a72cf9d3"}, + {file = "coverage-7.6.1-cp313-cp313t-win32.whl", hash = "sha256:170d444ab405852903b7d04ea9ae9b98f98ab6d7e63e1115e82620807519797f"}, + {file = "coverage-7.6.1-cp313-cp313t-win_amd64.whl", hash = "sha256:b9f222de8cded79c49bf184bdbc06630d4c58eec9459b939b4a690c82ed05657"}, + {file = "coverage-7.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6db04803b6c7291985a761004e9060b2bca08da6d04f26a7f2294b8623a0c1a0"}, + {file = "coverage-7.6.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f1adfc8ac319e1a348af294106bc6a8458a0f1633cc62a1446aebc30c5fa186a"}, + {file = "coverage-7.6.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a95324a9de9650a729239daea117df21f4b9868ce32e63f8b650ebe6cef5595b"}, + {file = "coverage-7.6.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b43c03669dc4618ec25270b06ecd3ee4fa94c7f9b3c14bae6571ca00ef98b0d3"}, + {file = "coverage-7.6.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8929543a7192c13d177b770008bc4e8119f2e1f881d563fc6b6305d2d0ebe9de"}, + {file = "coverage-7.6.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:a09ece4a69cf399510c8ab25e0950d9cf2b42f7b3cb0374f95d2e2ff594478a6"}, + {file = "coverage-7.6.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:9054a0754de38d9dbd01a46621636689124d666bad1936d76c0341f7d71bf569"}, + {file = "coverage-7.6.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0dbde0f4aa9a16fa4d754356a8f2e36296ff4d83994b2c9d8398aa32f222f989"}, + {file = "coverage-7.6.1-cp38-cp38-win32.whl", hash = "sha256:da511e6ad4f7323ee5702e6633085fb76c2f893aaf8ce4c51a0ba4fc07580ea7"}, + {file = "coverage-7.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:3f1156e3e8f2872197af3840d8ad307a9dd18e615dc64d9ee41696f287c57ad8"}, + {file = "coverage-7.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:abd5fd0db5f4dc9289408aaf34908072f805ff7792632250dcb36dc591d24255"}, + {file = "coverage-7.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:547f45fa1a93154bd82050a7f3cddbc1a7a4dd2a9bf5cb7d06f4ae29fe94eaf8"}, + {file = "coverage-7.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:645786266c8f18a931b65bfcefdbf6952dd0dea98feee39bd188607a9d307ed2"}, + {file = "coverage-7.6.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9e0b2df163b8ed01d515807af24f63de04bebcecbd6c3bfeff88385789fdf75a"}, + {file = "coverage-7.6.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:609b06f178fe8e9f89ef676532760ec0b4deea15e9969bf754b37f7c40326dbc"}, + {file = "coverage-7.6.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:702855feff378050ae4f741045e19a32d57d19f3e0676d589df0575008ea5004"}, + {file = "coverage-7.6.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:2bdb062ea438f22d99cba0d7829c2ef0af1d768d1e4a4f528087224c90b132cb"}, + {file = 
"coverage-7.6.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9c56863d44bd1c4fe2abb8a4d6f5371d197f1ac0ebdee542f07f35895fc07f36"}, + {file = "coverage-7.6.1-cp39-cp39-win32.whl", hash = "sha256:6e2cd258d7d927d09493c8df1ce9174ad01b381d4729a9d8d4e38670ca24774c"}, + {file = "coverage-7.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:06a737c882bd26d0d6ee7269b20b12f14a8704807a01056c80bb881a4b2ce6ca"}, + {file = "coverage-7.6.1-pp38.pp39.pp310-none-any.whl", hash = "sha256:e9a6e0eb86070e8ccaedfbd9d38fec54864f3125ab95419970575b42af7541df"}, + {file = "coverage-7.6.1.tar.gz", hash = "sha256:953510dfb7b12ab69d20135a0662397f077c59b1e6379a768e97c59d852ee51d"}, ] [package.dependencies] @@ -80,13 +100,13 @@ toml = ["tomli"] [[package]] name = "exceptiongroup" -version = "1.2.0" +version = "1.2.2" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" files = [ - {file = "exceptiongroup-1.2.0-py3-none-any.whl", hash = "sha256:4bfd3996ac73b41e9b9628b04e079f193850720ea5945fc96a08633c66912f14"}, - {file = "exceptiongroup-1.2.0.tar.gz", hash = "sha256:91f5c769735f051a4290d52edd0858999b57e5876e9f85937691bd4c9fa3ed68"}, + {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, + {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, ] [package.extras] @@ -105,13 +125,13 @@ files = [ [[package]] name = "jinja2" -version = "3.1.3" +version = "3.1.4" description = "A very fast and expressive template engine." optional = false python-versions = ">=3.7" files = [ - {file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"}, - {file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"}, + {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, + {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, ] [package.dependencies] @@ -191,24 +211,24 @@ files = [ [[package]] name = "packaging" -version = "24.0" +version = "24.1" description = "Core utilities for Python packages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "packaging-24.0-py3-none-any.whl", hash = "sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5"}, - {file = "packaging-24.0.tar.gz", hash = "sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9"}, + {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, + {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, ] [[package]] name = "pdoc" -version = "14.4.0" +version = "14.6.0" description = "API Documentation for Python Projects" optional = false python-versions = ">=3.8" files = [ - {file = "pdoc-14.4.0-py3-none-any.whl", hash = "sha256:6ea4fe07620b1f7601e2708a307a257636ec206e20b5611640b30f2e3cab47d6"}, - {file = "pdoc-14.4.0.tar.gz", hash = "sha256:c92edc425429ccbe287ace2a027953c24f13de53eab484c1a6d31ca72dd2fda9"}, + {file = "pdoc-14.6.0-py3-none-any.whl", hash = "sha256:36c42c546a317d8e3e8c0b39645f24161374de0c7066ccaae76628d721e49ba5"}, + {file = "pdoc-14.6.0.tar.gz", hash = "sha256:6e98a24c5e0ca5d188397969cf82581836eaef13f172fc3820047bfe15c61c9a"}, ] 
[package.dependencies] @@ -221,13 +241,13 @@ dev = ["hypothesis", "mypy", "pdoc-pyo3-sample-library (==1.0.11)", "pygments (> [[package]] name = "pluggy" -version = "1.4.0" +version = "1.5.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.8" files = [ - {file = "pluggy-1.4.0-py3-none-any.whl", hash = "sha256:7db9f7b503d67d1c5b95f59773ebb58a8c1c288129a88665838012cfb07b8981"}, - {file = "pluggy-1.4.0.tar.gz", hash = "sha256:8c85c2876142a764e5b7548e7d9a0e0ddb46f5185161049a79b7e974454223be"}, + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, ] [package.extras] @@ -236,58 +256,58 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "py4j" -version = "0.10.9.5" +version = "0.10.9.7" description = "Enables Python programs to dynamically access arbitrary Java objects" optional = false python-versions = "*" files = [ - {file = "py4j-0.10.9.5-py2.py3-none-any.whl", hash = "sha256:52d171a6a2b031d8a5d1de6efe451cf4f5baff1a2819aabc3741c8406539ba04"}, - {file = "py4j-0.10.9.5.tar.gz", hash = "sha256:276a4a3c5a2154df1860ef3303a927460e02e97b047dc0a47c1c3fb8cce34db6"}, + {file = "py4j-0.10.9.7-py2.py3-none-any.whl", hash = "sha256:85defdfd2b2376eb3abf5ca6474b51ab7e0de341c75a02f46dc9b5976f5a5c1b"}, + {file = "py4j-0.10.9.7.tar.gz", hash = "sha256:0b6e5315bb3ada5cf62ac651d107bb2ebc02def3dee9d9548e3baac644ea8dbb"}, ] [[package]] name = "pygments" -version = "2.17.2" +version = "2.18.0" description = "Pygments is a syntax highlighting package written in Python." optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "pygments-2.17.2-py3-none-any.whl", hash = "sha256:b27c2826c47d0f3219f29554824c30c5e8945175d888647acd804ddd04af846c"}, - {file = "pygments-2.17.2.tar.gz", hash = "sha256:da46cec9fd2de5be3a8a784f434e4c4ab670b4ff54d605c4c2717e9d49c4c367"}, + {file = "pygments-2.18.0-py3-none-any.whl", hash = "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a"}, + {file = "pygments-2.18.0.tar.gz", hash = "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199"}, ] [package.extras] -plugins = ["importlib-metadata"] windows-terminal = ["colorama (>=0.4.6)"] [[package]] name = "pyspark" -version = "3.2.2" +version = "3.5.1" description = "Apache Spark Python API" optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" files = [ - {file = "pyspark-3.2.2.tar.gz", hash = "sha256:5455214cf0b83d4a184cda25ca3b0812481915353b180cf7d7ac227728a4d99e"}, + {file = "pyspark-3.5.1.tar.gz", hash = "sha256:dd6569e547365eadc4f887bf57f153e4d582a68c4b490de475d55b9981664910"}, ] [package.dependencies] -py4j = "0.10.9.5" +py4j = "0.10.9.7" [package.extras] -ml = ["numpy (>=1.7)"] -mllib = ["numpy (>=1.7)"] -pandas-on-spark = ["numpy (>=1.14)", "pandas (>=0.23.2)", "pyarrow (>=1.0.0)"] -sql = ["pandas (>=0.23.2)", "pyarrow (>=1.0.0)"] +connect = ["googleapis-common-protos (>=1.56.4)", "grpcio (>=1.56.0)", "grpcio-status (>=1.56.0)", "numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] +ml = ["numpy (>=1.15)"] +mllib = ["numpy (>=1.15)"] +pandas-on-spark = ["numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] +sql = ["numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] [[package]] name = "pytest" -version = "8.1.1" +version = "8.3.2" description = "pytest: simple powerful testing with 
Python" optional = false python-versions = ">=3.8" files = [ - {file = "pytest-8.1.1-py3-none-any.whl", hash = "sha256:2a8386cfc11fa9d2c50ee7b2a57e7d898ef90470a7a34c4b949ff59662bb78b7"}, - {file = "pytest-8.1.1.tar.gz", hash = "sha256:ac978141a75948948817d360297b7aae0fcb9d6ff6bc9ec6d514b85d5a65c044"}, + {file = "pytest-8.3.2-py3-none-any.whl", hash = "sha256:4ba08f9ae7dcf84ded419494d229b48d0903ea6407b030eaec46df5e6a73bba5"}, + {file = "pytest-8.3.2.tar.gz", hash = "sha256:c132345d12ce551242c87269de812483f5bcc87cdbb4722e48487ba194f9fdce"}, ] [package.dependencies] @@ -295,11 +315,11 @@ colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" packaging = "*" -pluggy = ">=1.4,<2.0" +pluggy = ">=1.5,<2" tomli = {version = ">=1", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "pytest-cov" @@ -321,88 +341,91 @@ testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"] [[package]] name = "pyyaml" -version = "6.0.1" +version = "6.0.2" description = "YAML parser and emitter for Python" optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" files = [ - {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, - {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"}, - {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, - {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, - {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, - {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, - {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, - {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, - {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, - {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"}, - {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, - {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, - {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, - {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", 
hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, - {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, - {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, - {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, - {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, - {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, - {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"}, - {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"}, - {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"}, - {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"}, - {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"}, - {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"}, - {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"}, - {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"}, - {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"}, - {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"}, - {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, - {file = 
"PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, - {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, - {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, - {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, - {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, - {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, - {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"}, - {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, - {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, - {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, - {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, - {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, - {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, - {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, + {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, + {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"}, + {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"}, + {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"}, + {file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"}, + {file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"}, + {file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"}, + {file = 
"PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"}, + {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"}, + {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"}, + {file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"}, + {file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"}, + {file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"}, + {file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"}, + {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"}, + {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"}, + {file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"}, + {file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"}, + {file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"}, + {file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"}, + {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"}, + {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = 
"sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"}, + {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"}, + {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"}, + {file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"}, + {file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"}, + {file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"}, + {file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"}, + {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"}, + {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"}, + {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"}, + {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"}, + {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"}, + {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, + {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, ] [[package]] name = "ruff" -version = "0.3.7" +version = "0.5.7" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.3.7-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0e8377cccb2f07abd25e84fc5b2cbe48eeb0fea9f1719cad7caedb061d70e5ce"}, - {file = "ruff-0.3.7-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:15a4d1cc1e64e556fa0d67bfd388fed416b7f3b26d5d1c3e7d192c897e39ba4b"}, - {file = "ruff-0.3.7-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d28bdf3d7dc71dd46929fafeec98ba89b7c3550c3f0978e36389b5631b793663"}, - {file = "ruff-0.3.7-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:379b67d4f49774ba679593b232dcd90d9e10f04d96e3c8ce4a28037ae473f7bb"}, - {file = "ruff-0.3.7-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c060aea8ad5ef21cdfbbe05475ab5104ce7827b639a78dd55383a6e9895b7c51"}, - {file = "ruff-0.3.7-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:ebf8f615dde968272d70502c083ebf963b6781aacd3079081e03b32adfe4d58a"}, - {file = "ruff-0.3.7-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d48098bd8f5c38897b03604f5428901b65e3c97d40b3952e38637b5404b739a2"}, - {file = "ruff-0.3.7-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:da8a4fda219bf9024692b1bc68c9cff4b80507879ada8769dc7e985755d662ea"}, - {file = "ruff-0.3.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c44e0149f1d8b48c4d5c33d88c677a4aa22fd09b1683d6a7ff55b816b5d074f"}, - {file = "ruff-0.3.7-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:3050ec0af72b709a62ecc2aca941b9cd479a7bf2b36cc4562f0033d688e44fa1"}, - {file = "ruff-0.3.7-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:a29cc38e4c1ab00da18a3f6777f8b50099d73326981bb7d182e54a9a21bb4ff7"}, - {file = "ruff-0.3.7-py3-none-musllinux_1_2_i686.whl", hash = "sha256:5b15cc59c19edca917f51b1956637db47e200b0fc5e6e1878233d3a938384b0b"}, - {file = "ruff-0.3.7-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:e491045781b1e38b72c91247cf4634f040f8d0cb3e6d3d64d38dcf43616650b4"}, - {file = "ruff-0.3.7-py3-none-win32.whl", hash = "sha256:bc931de87593d64fad3a22e201e55ad76271f1d5bfc44e1a1887edd0903c7d9f"}, - {file = "ruff-0.3.7-py3-none-win_amd64.whl", hash = "sha256:5ef0e501e1e39f35e03c2acb1d1238c595b8bb36cf7a170e7c1df1b73da00e74"}, - {file = "ruff-0.3.7-py3-none-win_arm64.whl", hash = "sha256:789e144f6dc7019d1f92a812891c645274ed08af6037d11fc65fcbc183b7d59f"}, - {file = "ruff-0.3.7.tar.gz", hash = "sha256:d5c1aebee5162c2226784800ae031f660c350e7a3402c4d1f8ea4e97e232e3ba"}, + {file = "ruff-0.5.7-py3-none-linux_armv6l.whl", hash = "sha256:548992d342fc404ee2e15a242cdbea4f8e39a52f2e7752d0e4cbe88d2d2f416a"}, + {file = "ruff-0.5.7-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:00cc8872331055ee017c4f1071a8a31ca0809ccc0657da1d154a1d2abac5c0be"}, + {file = "ruff-0.5.7-py3-none-macosx_11_0_arm64.whl", hash = "sha256:eaf3d86a1fdac1aec8a3417a63587d93f906c678bb9ed0b796da7b59c1114a1e"}, + {file = "ruff-0.5.7-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a01c34400097b06cf8a6e61b35d6d456d5bd1ae6961542de18ec81eaf33b4cb8"}, + {file = "ruff-0.5.7-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fcc8054f1a717e2213500edaddcf1dbb0abad40d98e1bd9d0ad364f75c763eea"}, + {file = "ruff-0.5.7-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7f70284e73f36558ef51602254451e50dd6cc479f8b6f8413a95fcb5db4a55fc"}, + {file = "ruff-0.5.7-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", 
hash = "sha256:a78ad870ae3c460394fc95437d43deb5c04b5c29297815a2a1de028903f19692"}, + {file = "ruff-0.5.7-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9ccd078c66a8e419475174bfe60a69adb36ce04f8d4e91b006f1329d5cd44bcf"}, + {file = "ruff-0.5.7-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7e31c9bad4ebf8fdb77b59cae75814440731060a09a0e0077d559a556453acbb"}, + {file = "ruff-0.5.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d796327eed8e168164346b769dd9a27a70e0298d667b4ecee6877ce8095ec8e"}, + {file = "ruff-0.5.7-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:4a09ea2c3f7778cc635e7f6edf57d566a8ee8f485f3c4454db7771efb692c499"}, + {file = "ruff-0.5.7-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:a36d8dcf55b3a3bc353270d544fb170d75d2dff41eba5df57b4e0b67a95bb64e"}, + {file = "ruff-0.5.7-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9369c218f789eefbd1b8d82a8cf25017b523ac47d96b2f531eba73770971c9e5"}, + {file = "ruff-0.5.7-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b88ca3db7eb377eb24fb7c82840546fb7acef75af4a74bd36e9ceb37a890257e"}, + {file = "ruff-0.5.7-py3-none-win32.whl", hash = "sha256:33d61fc0e902198a3e55719f4be6b375b28f860b09c281e4bdbf783c0566576a"}, + {file = "ruff-0.5.7-py3-none-win_amd64.whl", hash = "sha256:083bbcbe6fadb93cd86709037acc510f86eed5a314203079df174c40bbbca6b3"}, + {file = "ruff-0.5.7-py3-none-win_arm64.whl", hash = "sha256:2dca26154ff9571995107221d0aeaad0e75a77b5a682d6236cf89a58c70b76f4"}, + {file = "ruff-0.5.7.tar.gz", hash = "sha256:8dfc0a458797f5d9fb622dd0efc52d796f23f0a1493a9527f4e49a550ae9a7e5"}, ] [[package]] @@ -418,5 +441,5 @@ files = [ [metadata] lock-version = "2.0" -python-versions = "^3.9" -content-hash = "31e8625c12ffbe3c361cd9d65293e1c1024f6b2361a8a5dc3d1f40799f9d020a" +python-versions = "^3.10" +content-hash = "123e743d47ce2a0da8bf405e81ee439cd40a1cc8ef65e578497416649a1b14a4" diff --git a/pyspark/pyproject.toml b/pyspark/pyproject.toml index 9e5aa275d..90e45d99b 100644 --- a/pyspark/pyproject.toml +++ b/pyspark/pyproject.toml @@ -24,13 +24,13 @@ readme = "README.md" packages = [{include = "graphar_pyspark"}] [tool.poetry.dependencies] -python = "^3.9" +python = "^3.10" [tool.poetry.group.spark] optional = true [tool.poetry.group.spark.dependencies] -pyspark = "3.2.2" # TODO: relax requirement when scala part will be available for multiple spark versions +pyspark = "3.5.1" # TODO: relax requirement when scala part will be available for multiple spark versions [tool.poetry.group.lint] optional = true