Skip to content

Commit

Permalink
Feature/yaml perftest (#34)
Browse files Browse the repository at this point in the history
* add perftest files

* add a perf test

* start on nano-dhall

* wip perftest

* add perftest

* Add memoization to grammar.

* add sourcecode 0.3.0 dependency

* Yaml perf test down to 4 seconds

* use Memoize.parse in tests

* wip

* fixes

* add test for LRUHashDict

* fixes

* add test file

* update readme

* update version to 0.2.1

* rename artifacts
  • Loading branch information
winitzki authored Jul 12, 2024
1 parent 6e281f6 commit 22db213
Show file tree
Hide file tree
Showing 27 changed files with 1,665 additions and 49 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,9 @@ Another feature is that some parses need to fail for others to succeed. For exam
identifier. However, `missing` is a keyword and is matched first. To ensure correct parsing, negative lookahead is used
for keywords.

To improve parsing performance, the parsing results for some sub-expressions are memoized.
This is implemented via an add-on library `fastparse-memoize`.

#### Limitations

So far, there are some issues with the Unicode characters:
Expand All @@ -238,8 +241,15 @@ So far, there are some issues with the Unicode characters:

# Release version history


## 0.2.1

- Implemented `fastparse-memoize` to speed up parsing (by 10x and more in some cases).
- Upgraded to fastparse 3.1.x

## 0.2.0

- First version published on Sonatype
- Fixed the regression described in https://github.com/dhall-lang/dhall-haskell/issues/2597
- Support for Yaml and JSON export
- Standalone JAR executable `dhall.jar` with command-line options similar to `dhall-haskell`
Expand Down
102 changes: 77 additions & 25 deletions build.sbt
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import sbt.Keys.{developers, homepage, scmInfo}
import sbt.Keys.homepage
import sbt.url
import sbtassembly.AssemblyKeys.assembly
import xerial.sbt.Sonatype.{GitHubHosting, sonatypeCentralHost}
import xerial.sbt.Sonatype.GitHubHosting

import scala.collection.immutable.List

val thisReleaseVersion = "0.2.0"
val thisReleaseVersion = "0.2.1"

val scala2V = "2.13.13"
val scala212V = "2.12.19"
Expand All @@ -18,7 +16,7 @@ def munitFramework = new TestFramework("munit.Framework")
val munitTest = "org.scalameta" %% "munit" % "0.7.29" % Test
val assertVerboseTest = "com.eed3si9n.expecty" %% "expecty" % "0.16.0" % Test

val fastparse = "com.lihaoyi" %% "fastparse" % "3.0.2"
val fastparse = "com.lihaoyi" %% "fastparse" % "3.1.1"
val antlr4 = "org.antlr" % "antlr4-runtime" % "4.13.1"
val anltr4_formatter = "com.khubla.antlr4formatter" % "antlr4-formatter-standalone" % "1.2.1" % Provided

Expand All @@ -34,6 +32,7 @@ val cbor1 = "co.nstant.in" % "cbor" % "0.9"
val cbor2 = "com.upokecenter" % "cbor" % "4.5.3"
val reflections = "org.reflections" % "reflections" % "0.10.2"
val mainargs = "com.lihaoyi" %% "mainargs" % "0.7.0"
val sourcecode = "com.lihaoyi" %% "sourcecode" % "0.4.2"

// Not used now:
val flatlaf = "com.formdev" % "flatlaf" % "3.2.2"
Expand All @@ -53,9 +52,6 @@ lazy val publishingOptions = Seq(
description := "Implementation of the Dhall language in Scala, with Scala language bindings",
publishTo := sonatypePublishToBundle.value,
sonatypeProjectHosting := Some(GitHubHosting("winitzki", "scall", "[email protected]")),
// homepage := Some(url("https://github.com/winitzki/scall")),
// scmInfo := Some(ScmInfo(url("https://github.com/winitzki/scall"), "scm:[email protected]:winitzki/scall.git")),
// developers := List(Developer(id = "winitzki", name = "Sergei Winitzki", email = "[email protected]", url = url("https://sites.google.com/site/winitzki"))),
)

lazy val noPublishing =
Expand All @@ -71,11 +67,72 @@ lazy val jdkModuleOptions: Seq[String] = {
lazy val root = (project in file("."))
.settings(noPublishing)
.settings(scalaVersion := scalaV, crossScalaVersions := Seq(scalaV), name := "scall-root")
.aggregate(scall_core, scall_testutils, dhall_codec, abnf, scall_macros, scall_typeclasses, scall_cli)
.aggregate(scall_core, scall_testutils, dhall_codec, abnf, scall_macros, scall_typeclasses, scall_cli, nano_dhall, fastparse_memoize)

// Proof-of-concept sub-project that lives in the `nano-dhall` directory.
// It is not published (`noPublishing`) and is built for every entry of `supportedScalaVersions`.
// It depends on `scall_typeclasses` and `fastparse_memoize`, and its tests can use the main code of `scall_testutils`.
lazy val nano_dhall = (project in file("nano-dhall")) // This is a POC project.
.settings(noPublishing)
.settings(
name := "nano-dhall",
scalaVersion := scalaV,
crossScalaVersions := supportedScalaVersions,
// Tests run in parallel and in a forked JVM; the fork receives `jdkModuleOptions` (see `Test / javaOptions` below).
Test / parallelExecution := true,
Test / fork := true,
coverageEnabled := false,
scalafmtFailOnErrors := false, // Cannot disable the unicode surrogate pair error in Parser.scala?
testFrameworks += munitFramework,
Test / javaOptions ++= jdkModuleOptions,
// Compiler flags chosen per Scala major version.
// NOTE(review): the match only lists Scala 3 and Scala 2.12 / 2.13; any other `partialVersion` result would raise a MatchError.
Compile / scalacOptions ++= {
CrossVersion.partialVersion(scalaVersion.value) match {
case Some((3, _)) => Seq("-Ydebug")
case Some((2, 12 | 13)) => Seq("-Ypatmat-exhaust-depth", "10") // Cannot make it smaller than 10. Want to speed up compilation.
}
},
// NOTE(review): this setting is scoped to `ThisBuild` even though it is declared inside this project's settings,
// so it is not limited to `nano-dhall` — confirm that this is intended.
ThisBuild / scalacOptions ++= {
CrossVersion.partialVersion(scalaVersion.value) match {
case Some((3, _)) => Seq("-Ykind-projector") // Seq("-Ykind-projector:underscores")
case Some((2, 12 | 13)) => Seq() // Seq("-Xsource:3", "-P:kind-projector:underscore-placeholders")
}
},
// We need to run tests in forked JVM starting with the current directory set to the base resource directory.
// That base directory should contain `./dhall-lang` and all files below that.
Test / baseDirectory := (Test / resourceDirectory).value,
// addCompilerPlugin is a shortcut for libraryDependencies += compilerPlugin(dependency)
// See https://stackoverflow.com/questions/67579041
// Scala 2 builds add scala-reflect and the kind-projector plugin; Scala 3 builds add nothing here.
libraryDependencies ++=
(CrossVersion.partialVersion(scalaVersion.value) match {
case Some((2, _)) => Seq(scala_reflect(scalaVersion.value), kindProjectorPlugin)
case Some((3, _)) => Seq.empty // No need for scala-reflect with Scala 3.
}),
// Library dependencies shared by all Scala versions; `os_lib` is restricted to the Test configuration.
libraryDependencies ++= Seq(
fastparse,
antlr4,
anltr4_formatter,
munitTest,
assertVerboseTest,
enumeratum,
cbor2,
// scalahashing,
// cbor3,
httpRequest,
os_lib % Test,
),
).dependsOn(scall_testutils % "test->compile", scall_typeclasses, fastparse_memoize)

// The `fastparse-memoize` add-on library, located in the `fastparse-memoize` directory.
// Unlike a `noPublishing` project it uses `publishingOptions`, and it is built for every entry of `supportedScalaVersions`.
lazy val fastparse_memoize = (project in file("fastparse-memoize"))
.settings(publishingOptions)
.settings(
name := "fastparse-memoize",
scalaVersion := scalaV,
crossScalaVersions := supportedScalaVersions,
testFrameworks += munitFramework,
Test / javaOptions ++= jdkModuleOptions,
// `fastparse` and `sourcecode` are the library dependencies; `munitTest` and `assertVerboseTest` are declared with `% Test`.
libraryDependencies ++= Seq(fastparse, sourcecode, munitTest, assertVerboseTest),
// Test code of this project may use the main (compile) code of `scall_testutils`.
).dependsOn(scall_testutils % "test->compile")

lazy val scall_core = (project in file("scall-core"))
.settings(publishingOptions)
.settings(
name := "dhall-scala-core",
scalaVersion := scalaV,
crossScalaVersions := supportedScalaVersions,
Test / parallelExecution := true,
Expand Down Expand Up @@ -119,11 +176,12 @@ lazy val scall_core = (project in file("scall-core"))
httpRequest,
os_lib % Test,
),
).dependsOn(scall_testutils % "test->compile", scall_typeclasses)
).dependsOn(scall_testutils % "test->compile", scall_typeclasses, fastparse_memoize)

lazy val scall_testutils = (project in file("scall-testutils"))
.settings(publishingOptions)
.settings(
name := "dhall-scala-testutils",
scalaVersion := scalaV,
crossScalaVersions := supportedScalaVersions,
Test / parallelExecution := true,
Expand All @@ -136,6 +194,7 @@ lazy val scall_testutils = (project in file("scall-testutils"))
lazy val dhall_codec = (project in file("dhall-codec"))
.settings(publishingOptions)
.settings(
name := "dhall-scala-bindings",
scalaVersion := scalaV,
crossScalaVersions := supportedScalaVersions,
Test / parallelExecution := true,
Expand All @@ -155,6 +214,7 @@ lazy val dhall_codec = (project in file("dhall-codec"))
lazy val scall_cli = (project in file("scall-cli"))
.settings(publishingOptions)
.settings(
name := "dhall-scala-cli",
scalaVersion := scalaV,
crossScalaVersions := supportedScalaVersions,
Test / parallelExecution := true,
Expand All @@ -178,7 +238,7 @@ lazy val scall_cli = (project in file("scall-cli"))
lazy val abnf = (project in file("abnf"))
.settings(noPublishing)
.settings(
name := "scall-abnf",
name := "dhall-scala-abnf",
scalaVersion := scalaV,
crossScalaVersions := supportedScalaVersions,
Test / parallelExecution := true,
Expand All @@ -189,7 +249,7 @@ lazy val abnf = (project in file("abnf"))
lazy val scall_macros = (project in file("scall-macros"))
.settings(publishingOptions)
.settings(
name := "scall-macros",
name := "dhall-scala-macros",
scalaVersion := scalaV,
crossScalaVersions := supportedScalaVersions,
Test / parallelExecution := true,
Expand All @@ -205,7 +265,7 @@ lazy val scall_macros = (project in file("scall-macros"))
lazy val scall_typeclasses = (project in file("scall-typeclasses"))
.settings(publishingOptions)
.settings(
name := "scall-typeclasses",
name := "dhall-scala-typeclasses",
scalaVersion := scalaV,
crossScalaVersions := supportedScalaVersions,
Test / parallelExecution := true,
Expand All @@ -220,18 +280,10 @@ lazy val scall_typeclasses = (project in file("scall-typeclasses"))

/////////////////////////////////////////////////////////////////////////////////////////////////////
// Publishing to Sonatype Maven repository
publishMavenStyle := true
publishTo := sonatypePublishToBundle.value
sonatypeProfileName := "io.chymyst"
publishMavenStyle := true
publishTo := sonatypePublishToBundle.value
sonatypeProfileName := "io.chymyst"
//ThisBuild / sonatypeCredentialHost := sonatypeCentralHost // Not relevant because io.chymyst was created before 2021.

/*{
val nexus = "https://oss.sonatype.org/"
if (isSnapshot.value)
Some("snapshots" at nexus + "content/repositories/snapshots")
else
Some("releases" at nexus + "service/local/staging/deploy/maven2")
}*/
//
Test / publishArtifact := false
//
Expand Down
122 changes: 122 additions & 0 deletions fastparse-memoize/src/main/scala/io/chymyst/fastparse/Memoize.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
package io.chymyst.fastparse

import fastparse.{P, Parsed, ParserInput, ParserInputSource, ParsingRun}
import fastparse.internal.{Instrument, Msgs}

import scala.collection.mutable

/* See discussion in https://github.com/com-lihaoyi/fastparse/discussions/301 */

/** Snapshot of the mutable fields of a `ParsingRun`.
  *
  * Each field carries the same name as the `ParsingRun` field it was read from, so that a snapshot can later be written
  * back field by field. Note that `misc` is itself a mutable map.
  */
final case class PRunData(
  terminalMsgs: Msgs,
  aggregateMsgs: Msgs,
  shortMsg: Msgs,
  lastFailureMsg: Msgs,
  failureStack: List[(String, Int)],
  isSuccess: Boolean,
  logDepth: Int,
  index: Int,
  cut: Boolean,
  successValue: Any,
  verboseFailures: Boolean,
  noDropBuffer: Boolean,
  misc: mutable.Map[Any, Any]
) {

  /** Short rendering that shows only the position, the success flag and the parsed value. */
  override def toString: String =
    s"ParsingRun(index=$index, isSuccess = $isSuccess, successValue = $successValue)"
}

object PRunData {

  /** Captures the current mutable state of `pr` in a new [[PRunData]] value.
    *
    * All fields are copied by reference except `misc`, whose entries are copied into a fresh mutable map so that later
    * changes to `pr.misc` do not alter the snapshot.
    *
    * @param pr
    *   the parsing run to read from; it is not modified
    * @return
    *   a snapshot holding the values `pr` had at the time of the call
    */
  def ofParsingRun[T](pr: ParsingRun[T]): PRunData =
    PRunData(
      terminalMsgs = pr.terminalMsgs,
      aggregateMsgs = pr.aggregateMsgs,
      shortMsg = pr.shortMsg,
      lastFailureMsg = pr.lastFailureMsg,
      failureStack = pr.failureStack,
      isSuccess = pr.isSuccess,
      logDepth = pr.logDepth,
      index = pr.index,
      cut = pr.cut,
      successValue = pr.successValue,
      verboseFailures = pr.verboseFailures,
      noDropBuffer = pr.noDropBuffer,
      misc = mutable.Map.from(pr.misc)
    )
}

/** Memoization of parsing results for `fastparse` grammars.
  *
  * Usage: bring [[Memoize.MemoizeParser]] into scope, append `.memoize` to a parser expression inside a grammar rule, and
  * start parsing through [[Memoize.parse]] or [[Memoize.parseInputRaw]], which reset the caches around every run.
  *
  * The caches are plain mutable hash maps stored in this object and no synchronization is performed here, so all
  * parsers in the process share them.
  */
object Memoize {

  /** Global switch. When `false`, `.memoize` just evaluates the wrapped parser. */
  val enable = true

  /** Overwrites the mutable fields of `pr` with the values stored in `data`.
    *
    * The entries of `data.misc` are added to `pr.misc`; entries already present in `pr.misc` under other keys are kept.
    *
    * @param data
    *   a snapshot previously taken with `PRunData.ofParsingRun`
    * @param pr
    *   the parsing run to update in place
    * @return
    *   the same `pr` instance, after the update
    */
  def assignToParsingRun[T](data: PRunData, pr: ParsingRun[T]): ParsingRun[T] = {
    pr.terminalMsgs = data.terminalMsgs
    pr.aggregateMsgs = data.aggregateMsgs
    pr.shortMsg = data.shortMsg
    pr.lastFailureMsg = data.lastFailureMsg
    pr.failureStack = data.failureStack
    pr.isSuccess = data.isSuccess
    pr.logDepth = data.logDepth
    pr.index = data.index
    pr.cut = data.cut
    pr.successValue = data.successValue
    pr.verboseFailures = data.verboseFailures
    pr.noDropBuffer = data.noDropBuffer
    data.misc.foreach { case (k, v) => pr.misc.put(k, v) }
    pr
  }

  /** Runs `parser` at the current position of `p` unless a result for that position is already in `cache`.
    *
    * `parser` is a by-name argument: it is evaluated at most once per call, and only on a cache miss.
    */
  @inline private def cacheGrammar[R](cache: mutable.Map[Int, PRunData], parser: => P[_])(implicit p: P[_]): P[R] = {
    // The `parser` has not yet been run! And it is mutable. Do not run it twice!
    val cachedData: PRunData = cache.getOrElseUpdate(p.index, PRunData.ofParsingRun(parser))
    // After the `parser` has been run on `p`, the value of `p` changes and becomes equal to the result of running the parser.
    // If the result was cached, we need to assign it to the current value of `p`. This will imitate the side effect of running the parser again.
    assignToParsingRun(cachedData, p).asInstanceOf[P[R]]
  }

  // One inner cache per `.memoize` call site (identified by source file and line); inner keys are input positions.
  private val cache = new mutable.HashMap[(sourcecode.File, sourcecode.Line), mutable.Map[Int, PRunData]]

  /** Returns the inner cache registered for the given call site, creating an empty one on first use. */
  private def getOrCreateCache(file: sourcecode.File, line: sourcecode.Line): mutable.Map[Int, PRunData] =
    cache.getOrElseUpdate((file, line), new mutable.HashMap[Int, PRunData])

  /** Adds the `.memoize` method to parser expressions. The parser is taken by name so that it is not run on a cache hit. */
  implicit class MemoizeParser[A](parser: => P[A]) {

    /** Memoized version of the wrapped parser.
      *
      * @param file
      *   implicit source file of the call site, part of the cache key
      * @param line
      *   implicit source line of the call site, part of the cache key
      * @param p
      *   the current parsing run
      */
    @inline def memoize(implicit file: sourcecode.File, line: sourcecode.Line, p: P[_]): P[A] = if (enable) {
      val callSiteCache: mutable.Map[Int, PRunData] = getOrCreateCache(file, line)
      cacheGrammar(callSiteCache, parser)
    } else parser
  }

  /** Empties every per-call-site cache. The call-site entries themselves are kept. */
  def clearAll(): Unit = cache.values.foreach(_.clear())

  /** One line per call site with the number of positions currently cached for it. */
  def statistics: String = cache.map { case ((file, line), c) => s"$file#$line: ${c.size} entries" }.mkString("\n")

  /** Clears the caches, evaluates `run`, and clears the caches again even when `run` throws.
    *
    * Without the `finally`, an exception thrown during parsing would leave all snapshots of the failed run in memory
    * until the next parse.
    */
  private def withClearedCaches[R](run: => R): R = {
    clearAll()
    try run
    finally clearAll()
  }

  /** Same as `fastparse.parse`, with the memoization caches cleared before and after the run.
    *
    * The parameters are passed to `fastparse.parse` unchanged.
    */
  def parse[T](
    input: ParserInputSource,
    parser: P[_] => P[T],
    verboseFailures: Boolean = false,
    startIndex: Int = 0,
    instrument: Instrument = null,
  ): Parsed[T] =
    withClearedCaches(fastparse.parse(input, parser, verboseFailures, startIndex, instrument))

  /** Same as `fastparse.parseInputRaw`, with the memoization caches cleared before and after the run.
    *
    * The parameters are passed to `fastparse.parseInputRaw` unchanged.
    */
  def parseInputRaw[T](
    input: ParserInput,
    parser: P[_] => P[T],
    verboseFailures: Boolean = false,
    startIndex: Int = 0,
    traceIndex: Int = -1,
    instrument: Instrument = null,
    enableLogging: Boolean = true,
  ): ParsingRun[T] =
    withClearedCaches(fastparse.parseInputRaw(input, parser, verboseFailures, startIndex, traceIndex, instrument, enableLogging))

}
Loading

0 comments on commit 22db213

Please sign in to comment.