Merge pull request #40 from Callidon/cache
Enable caching of Basic Graph Patterns
Callidon authored Feb 17, 2020
2 parents 5b2f62b + 3373651 commit 6d42a49
Showing 16 changed files with 1,136 additions and 95 deletions.
19 changes: 19 additions & 0 deletions README.md
@@ -12,6 +12,7 @@ An open-source framework for building SPARQL query engines in Javascript/Typescript
* Implements advanced *SPARQL query rewriting techniques* for transparently optimizing SPARQL query processing.
* Supports [full text search queries](#full-text-search).
* Supports [Custom SPARQL functions](#custom-functions).
* Supports [Semantic Caching](#enable-caching), to speed up the evaluation of recurrent patterns.
* Supports the [SPARQL UPDATE protocol](https://www.w3.org/TR/2013/REC-sparql11-update-20130321/).
* Supports Basic [Federated SPARQL queries](https://www.w3.org/TR/2013/REC-sparql11-federated-query-20130321/) using **SERVICE clauses**.
* Customize every step of SPARQL query processing, thanks to *a modular architecture*.
@@ -27,6 +28,7 @@ An open-source framework for building SPARQL query engines in Javascript/Typescript
* [RDF Graphs](#rdf-graphs)
* [RDF Datasets](#rdf-datasets)
* [Running a SPARQL query](#running-a-sparql-query)
* [Enable caching](#enable-caching)
* [Full text search](#full-text-search)
* [Federated SPARQL Queries](#federated-sparql-queries)
* [Custom Functions](#custom-functions)
@@ -188,6 +190,23 @@ Finally, to run a SPARQL query on your RDF dataset, you need to use the `PlanBuilder`
)
```

# Enable caching

The `sparql-engine` provides support for automatic caching of Basic Graph Pattern evaluation, using the [Semantic Cache algorithm](https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1161590). The cache saves the results of BGPs already evaluated and, when the engine needs to evaluate a BGP, it looks for the largest subset of that BGP in the cache. If one is found, the cached results are re-used to speed up query processing.
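
Internally, this lookup is exposed by the `findSubset` method of the `BGPCache` interface added by this commit (see `src/engine/cache/bgp-cache.ts` below). A minimal sketch of its contract, assuming `cache` is a `BGPCache` instance and `bgp` a parsed Basic Graph Pattern:

```javascript
// split the input BGP against the cache (sketch: `cache` and `bgp`
// are assumed to be already available at this point)
const [subset, missing] = cache.findSubset(bgp)
// `subset` is the largest cached BGP that is a subset of `bgp`;
// its results can be streamed with `cache.getAsPipeline(subset)`.
// `missing` holds the triple patterns that must still be evaluated.
```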

By default, semantic caching is disabled. You can turn it on or off using the `PlanBuilder.useCache` and `PlanBuilder.disableCache` methods, respectively. The `useCache` method accepts an optional parameter, so you can provide your own implementation of the semantic cache. By default, it uses an in-memory [LRU cache](https://callidon.github.io/sparql-engine/classes/lrubgpcache.html) which stores up to 500MB of items for up to 20 minutes.

```javascript
// get an instance of a PlanBuilder
const builder = new PlanBuilder(/* ... */)

// activate the cache
builder.useCache()

// disable the cache
builder.disableCache()
```
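
For example, here is a minimal sketch that supplies a custom cache (assumption: `LRUBGPCache` is re-exported from the package root; its constructor takes a maximum size and a maximum age in milliseconds, per this commit):

```javascript
// assumption: LRUBGPCache is re-exported by the package root
const { LRUBGPCache } = require('sparql-engine')

// store up to 100MB of items, each for at most 5 minutes
builder.useCache(new LRUBGPCache(100 * 1024 * 1024, 5 * 60 * 1000))
```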

# Full Text Search

The `sparql-engine` provides a non-standard full text search functionality,
3 changes: 3 additions & 0 deletions package.json
@@ -38,6 +38,7 @@
"homepage": "https://github.com/Callidon/sparql-engine#readme",
"devDependencies": {
"@types/lodash": "^4.14.116",
"@types/lru-cache": "^5.1.0",
"@types/node": "^10.14.17",
"@types/uuid": "^3.4.4",
"@types/xml": "^1.0.2",
@@ -53,7 +54,9 @@
},
"dependencies": {
"@rdfjs/data-model": "^1.1.2",
"binary-search-tree": "^0.2.6",
"lodash": "^4.17.15",
"lru-cache": "^5.1.1",
"moment": "^2.22.2",
"n3": "^0.11.3",
"rdf-string": "^1.3.1",
177 changes: 177 additions & 0 deletions src/engine/cache/bgp-cache.ts
@@ -0,0 +1,177 @@
/* file: bgp-cache.ts
MIT License
Copyright (c) 2019-2020 Thomas Minier
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the 'Software'), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

'use strict'

import { AsyncCacheEntry, AsyncLRUCache } from './cache-base'
import { AsyncCache } from './cache-interfaces'
import { Pipeline } from '../pipeline/pipeline'
import { PipelineStage } from '../pipeline/pipeline-engine'
import { Bindings } from '../../rdf/bindings'
import { Algebra } from 'sparqljs'
import { rdf, sparql } from '../../utils'
import { BinarySearchTree } from 'binary-search-tree'
import { differenceWith, findIndex, maxBy } from 'lodash'

// type alias to simplify the type definition in this file
type BasicGraphPattern = Algebra.TripleObject[]

interface SavedBGP {
bgp: BasicGraphPattern,
key: string
}

/**
* An async cache that stores the solution bindings from BGP evaluation
* @author Thomas Minier
*/
export interface BGPCache extends AsyncCache<BasicGraphPattern, Bindings, string> {

/**
* Search for a BGP in the cache that is a subset of the input BGP.
* This method enables the Semantic Caching technique: evaluating a BGP
* using the cached results of one of its subsets.
* @param bgp - Basic Graph pattern
* @return A pair [subset BGP, set of patterns not in cache]
*/
findSubset (bgp: BasicGraphPattern): [BasicGraphPattern, BasicGraphPattern]

/**
* Access the cache and return a pipeline stage that emits the content of the cache entry for a given BGP
* @param bgp - Cache key, i.e., a Basic Graph pattern
* @return A pipeline stage that returns the content of the cache entry for the given BGP
*/
getAsPipeline (bgp: BasicGraphPattern): PipelineStage<Bindings>
}

/**
* An implementation of a {@link BGPCache} using an {@link AsyncLRUCache}
* @author Thomas Minier
*/
export class LRUBGPCache implements BGPCache {
// Main index: maps each triple pattern to the BGPs in which it occurs.
// Used to speed up the #findSubset method
private readonly _allKeys: BinarySearchTree<string, SavedBGP>
// Secondary index: track the triple patterns of each BGP.
// Used to clear the primary index when items slide out of the cache
private readonly _patternsPerBGP: Map<string, BasicGraphPattern>
// AsyncCache used to store sets of solution bindings
private readonly _cache: AsyncLRUCache<string, Bindings, string>

/**
* Constructor
* @param maxSize - The maximum size of the cache
* @param maxAge - Maximum age in ms
*/
constructor (maxSize: number, maxAge: number) {
this._patternsPerBGP = new Map()
this._allKeys = new BinarySearchTree({
checkValueEquality: (a: SavedBGP, b: SavedBGP) => a.key === b.key
})
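// the size of a cache entry is the number of solution bindings it contains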
this._cache = new AsyncLRUCache(maxSize, maxAge, (item: AsyncCacheEntry<Bindings, string>) => {
return item.content.length
}, (key: string) => {
// remove index entries when they slide out
if (this._patternsPerBGP.has(key)) {
const bgp = this._patternsPerBGP.get(key)!
bgp.forEach(pattern => this._allKeys.delete(rdf.hashTriple(pattern), { bgp, key }))
this._patternsPerBGP.delete(key)
}
})
}

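/**
* Test if the cache contains an entry for a Basic Graph Pattern
* @param bgp - Basic Graph Pattern
* @return True if the cache contains an entry for this BGP, False otherwise
*/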
has (bgp: BasicGraphPattern): boolean {
return this._cache.has(sparql.hashBGP(bgp))
}

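/**
* Add a new set of solution bindings to the cache entry for a BGP,
* registering the BGP in the indexes if needed
* @param bgp - Basic Graph Pattern
* @param item - Set of solution bindings
* @param writerID - ID of the writer that evaluates the BGP
*/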
update (bgp: BasicGraphPattern, item: Bindings, writerID: string): void {
const key = sparql.hashBGP(bgp)
if (!this._cache.has(key)) {
// update the indexes
this._patternsPerBGP.set(key, bgp)
bgp.forEach(pattern => this._allKeys.insert(rdf.hashTriple(pattern), { bgp, key }))
}
this._cache.update(key, item, writerID)
}

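/**
* Access the cache entry for a Basic Graph Pattern
* @param bgp - Basic Graph Pattern
* @return A promise resolved with the cached solution bindings, or null if the BGP is not cached
*/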
get (bgp: BasicGraphPattern): Promise<Bindings[]> | null {
return this._cache.get(sparql.hashBGP(bgp))
}

getAsPipeline (bgp: BasicGraphPattern): PipelineStage<Bindings> {
const bindings = this.get(bgp)
if (bindings === null) {
return Pipeline.getInstance().empty()
}
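// clone each set of bindings, so downstream operators cannot mutate the cached copies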
return Pipeline.getInstance().flatMap(Pipeline.getInstance().from(bindings), x => x.map(b => b.clone()))
}

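/**
* Commit the cache entry for a BGP, i.e., mark it as fully evaluated
* @param bgp - Basic Graph Pattern
* @param writerID - ID of the writer that evaluated the BGP
*/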
commit (bgp: BasicGraphPattern, writerID: string): void {
this._cache.commit(sparql.hashBGP(bgp), writerID)
}

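/**
* Remove the cache entry for a BGP and clear the associated indexes
* @param bgp - Basic Graph Pattern
* @param writerID - ID of the writer that evaluated the BGP
*/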
delete (bgp: BasicGraphPattern, writerID: string): void {
const key = sparql.hashBGP(bgp)
this._cache.delete(key, writerID)
// clear the indexes
this._patternsPerBGP.delete(key)
bgp.forEach(pattern => this._allKeys.delete(rdf.hashTriple(pattern), { bgp, key }))
}

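/**
* Get the number of entries stored in the cache
* @return The number of cache entries
*/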
count (): number {
return this._cache.count()
}

findSubset (bgp: BasicGraphPattern): [BasicGraphPattern, BasicGraphPattern] {
// if the bgp is in the cache, then the computation is simple
if (this.has(bgp)) {
return [bgp, []]
}
// otherwise, we search for all candidate subsets
let matches = []
for (let pattern of bgp) {
const searchResults = this._allKeys
.search(rdf.hashTriple(pattern))
.filter(v => {
// remove all BGPs that are not a subset of the input BGP
// we use lodash.findIndex + rdf.tripleEquals to check for triple pattern equality
return v.bgp.every(a => findIndex(bgp, b => rdf.tripleEquals(a, b)) > -1)
})
matches.push({ pattern, searchResults })
}
// compute the largest subset BGP and the missing patterns (missingPatterns = input_BGP - subset_BGP)
let foundPatterns: BasicGraphPattern = []
let maxBGPLength = -1
for (let match of matches) {
if (match.searchResults.length > 0) {
const localMax = maxBy(match.searchResults, v => v.bgp.length)
if (localMax !== undefined && localMax.bgp.length > maxBGPLength) {
maxBGPLength = localMax.bgp.length
foundPatterns = localMax.bgp
}
}
}
return [foundPatterns, differenceWith(bgp, foundPatterns, rdf.tripleEquals)]
}
}