Merge pull request #40 from Callidon/cache
Enable caching of Basic Graph Patterns
Callidon authored Feb 17, 2020
2 parents 5b2f62b + 3373651 commit 6d42a49
Showing 16 changed files with 1,136 additions and 95 deletions.
19 changes: 19 additions & 0 deletions README.md
@@ -12,6 +12,7 @@ An open-source framework for building SPARQL query engines in Javascript/Typescript
* Implements advanced *SPARQL query rewriting techniques* for transparently optimizing SPARQL query processing.
* Supports [full text search queries](#full-text-search).
* Supports [Custom SPARQL functions](#custom-functions).
* Supports [Semantic Caching](#enable-caching), to speed up the evaluation of recurrent patterns.
* Supports the [SPARQL UPDATE protocol](https://www.w3.org/TR/2013/REC-sparql11-update-20130321/).
* Supports Basic [Federated SPARQL queries](https://www.w3.org/TR/2013/REC-sparql11-federated-query-20130321/) using **SERVICE clauses**.
* Customize every step of SPARQL query processing, thanks to *a modular architecture*.
@@ -27,6 +28,7 @@ An open-source framework for building SPARQL query engines in Javascript/Typescript
* [RDF Graphs](#rdf-graphs)
* [RDF Datasets](#rdf-datasets)
* [Running a SPARQL query](#running-a-sparql-query)
* [Enable caching](#enable-caching)
* [Full text search](#full-text-search)
* [Federated SPARQL Queries](#federated-sparql-queries)
* [Custom Functions](#custom-functions)
@@ -188,6 +190,23 @@ Finally, to run a SPARQL query on your RDF dataset, you need to use the `PlanBuilder`
)
```

# Enable caching

The `sparql-engine` provides support for automatic caching of Basic Graph Pattern evaluation, using the [Semantic Cache algorithm](https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1161590). The cache saves the results of BGPs already evaluated and, when the engine needs to evaluate a BGP, it looks for the largest subset of that BGP in the cache. If one is found, the cached results are re-used to speed up query processing.
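
Internally, this lookup is exposed by the `findSubset` method of the `BGPCache` interface added by this commit (see `src/engine/cache/bgp-cache.ts` below). A minimal sketch of its contract, assuming `cache` is a `BGPCache` instance and `bgp` a parsed Basic Graph Pattern:

```javascript
// split the input BGP against the cache (sketch: `cache` and `bgp`
// are assumed to be already available at this point)
const [subset, missing] = cache.findSubset(bgp)
// `subset` is the largest cached BGP that is a subset of `bgp`;
// its results can be streamed with `cache.getAsPipeline(subset)`.
// `missing` holds the triple patterns that must still be evaluated.
```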

By default, semantic caching is disabled. You can turn it on or off using the `PlanBuilder.useCache` and `PlanBuilder.disableCache` methods, respectively. The `useCache` method accepts an optional parameter, so you can provide your own implementation of the semantic cache. By default, it uses an in-memory [LRU cache](https://callidon.github.io/sparql-engine/classes/lrubgpcache.html) which stores up to 500MB of items for up to 20 minutes.

```javascript
// get an instance of a PlanBuilder
const builder = new PlanBuilder(/* ... */)

// activate the cache
builder.useCache()

// disable the cache
builder.disableCache()
```
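
For example, here is a minimal sketch that supplies a custom cache (assumption: `LRUBGPCache` is re-exported from the package root; its constructor takes a maximum size and a maximum age in milliseconds, per this commit):

```javascript
// assumption: LRUBGPCache is re-exported by the package root
const { LRUBGPCache } = require('sparql-engine')

// store up to 100MB of items, each for at most 5 minutes
builder.useCache(new LRUBGPCache(100 * 1024 * 1024, 5 * 60 * 1000))
```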

# Full Text Search

The `sparql-engine` provides a non-standard full text search functionality,
3 changes: 3 additions & 0 deletions package.json
@@ -38,6 +38,7 @@
"homepage": "https://github.com/Callidon/sparql-engine#readme",
"devDependencies": {
"@types/lodash": "^4.14.116",
"@types/lru-cache": "^5.1.0",
"@types/node": "^10.14.17",
"@types/uuid": "^3.4.4",
"@types/xml": "^1.0.2",
@@ -53,7 +54,9 @@
},
"dependencies": {
"@rdfjs/data-model": "^1.1.2",
"binary-search-tree": "^0.2.6",
"lodash": "^4.17.15",
"lru-cache": "^5.1.1",
"moment": "^2.22.2",
"n3": "^0.11.3",
"rdf-string": "^1.3.1",
177 changes: 177 additions & 0 deletions src/engine/cache/bgp-cache.ts
@@ -0,0 +1,177 @@
/* file: bgp-cache.ts
MIT License
Copyright (c) 2019-2020 Thomas Minier
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the 'Software'), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

'use strict'

import { AsyncCacheEntry, AsyncLRUCache } from './cache-base'
import { AsyncCache } from './cache-interfaces'
import { Pipeline } from '../pipeline/pipeline'
import { PipelineStage } from '../pipeline/pipeline-engine'
import { Bindings } from '../../rdf/bindings'
import { Algebra } from 'sparqljs'
import { rdf, sparql } from '../../utils'
import { BinarySearchTree } from 'binary-search-tree'
import { differenceWith, findIndex, maxBy } from 'lodash'

// type alias to simplify the type definition in this file
type BasicGraphPattern = Algebra.TripleObject[]

interface SavedBGP {
bgp: BasicGraphPattern,
key: string
}

/**
* An async cache that stores the solution bindings from BGP evaluation
* @author Thomas Minier
*/
export interface BGPCache extends AsyncCache<BasicGraphPattern, Bindings, string> {

/**
* Search for a BGP in the cache that is a subset of the input BGP.
* This method enables the Semantic Caching technique: evaluating a BGP
* using the cached results of one of its subsets.
* @param bgp - Basic Graph pattern
* @return A pair [subset BGP, set of patterns not in cache]
*/
findSubset (bgp: BasicGraphPattern): [BasicGraphPattern, BasicGraphPattern]

/**
* Access the cache and return a pipeline stage that emits the content of the cache entry for a given BGP
* @param bgp - Cache key, i.e., a Basic Graph pattern
* @return A pipeline stage that returns the content of the cache entry for the given BGP
*/
getAsPipeline (bgp: BasicGraphPattern): PipelineStage<Bindings>
}

/**
* An implementation of a {@link BGPCache} using an {@link AsyncLRUCache}
* @author Thomas Minier
*/
export class LRUBGPCache implements BGPCache {
// Main index: maps each triple pattern to the BGPs in which it occurs.
// Used to speed up the #findSubset method
private readonly _allKeys: BinarySearchTree<string, SavedBGP>
// Secondary index: track the triple patterns of each BGP.
// Used to clear the primary index when items slide out of the cache
private readonly _patternsPerBGP: Map<string, BasicGraphPattern>
// AsyncCache used to store sets of solution bindings
private readonly _cache: AsyncLRUCache<string, Bindings, string>

/**
* Constructor
* @param maxSize - The maximum size of the cache
* @param maxAge - Maximum age in ms
*/
constructor (maxSize: number, maxAge: number) {
this._patternsPerBGP = new Map()
this._allKeys = new BinarySearchTree({
checkValueEquality: (a: SavedBGP, b: SavedBGP) => a.key === b.key
})
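// the size of a cache entry is the number of solution bindings it contains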
this._cache = new AsyncLRUCache(maxSize, maxAge, (item: AsyncCacheEntry<Bindings, string>) => {
return item.content.length
}, (key: string) => {
// remove index entries when they slide out
if (this._patternsPerBGP.has(key)) {
const bgp = this._patternsPerBGP.get(key)!
bgp.forEach(pattern => this._allKeys.delete(rdf.hashTriple(pattern), { bgp, key }))
this._patternsPerBGP.delete(key)
}
})
}

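/**
* Test if the cache contains an entry for a Basic Graph Pattern
* @param bgp - Basic Graph Pattern
* @return True if the cache contains an entry for this BGP, False otherwise
*/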
has (bgp: BasicGraphPattern): boolean {
return this._cache.has(sparql.hashBGP(bgp))
}

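/**
* Add a new set of solution bindings to the cache entry for a BGP,
* registering the BGP in the indexes if needed
* @param bgp - Basic Graph Pattern
* @param item - Set of solution bindings
* @param writerID - ID of the writer that evaluates the BGP
*/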
update (bgp: BasicGraphPattern, item: Bindings, writerID: string): void {
const key = sparql.hashBGP(bgp)
if (!this._cache.has(key)) {
// update the indexes
this._patternsPerBGP.set(key, bgp)
bgp.forEach(pattern => this._allKeys.insert(rdf.hashTriple(pattern), { bgp, key }))
}
this._cache.update(key, item, writerID)
}

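/**
* Access the cache entry for a Basic Graph Pattern
* @param bgp - Basic Graph Pattern
* @return A promise resolved with the cached solution bindings, or null if the BGP is not cached
*/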
get (bgp: BasicGraphPattern): Promise<Bindings[]> | null {
return this._cache.get(sparql.hashBGP(bgp))
}

getAsPipeline (bgp: BasicGraphPattern): PipelineStage<Bindings> {
const bindings = this.get(bgp)
if (bindings === null) {
return Pipeline.getInstance().empty()
}
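// clone each set of bindings, so downstream operators cannot mutate the cached copies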
return Pipeline.getInstance().flatMap(Pipeline.getInstance().from(bindings), x => x.map(b => b.clone()))
}

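/**
* Commit the cache entry for a BGP, i.e., mark it as fully evaluated
* @param bgp - Basic Graph Pattern
* @param writerID - ID of the writer that evaluated the BGP
*/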
commit (bgp: BasicGraphPattern, writerID: string): void {
this._cache.commit(sparql.hashBGP(bgp), writerID)
}

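/**
* Remove the cache entry for a BGP and clear the associated indexes
* @param bgp - Basic Graph Pattern
* @param writerID - ID of the writer that evaluated the BGP
*/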
delete (bgp: BasicGraphPattern, writerID: string): void {
const key = sparql.hashBGP(bgp)
this._cache.delete(key, writerID)
// clear the indexes
this._patternsPerBGP.delete(key)
bgp.forEach(pattern => this._allKeys.delete(rdf.hashTriple(pattern), { bgp, key }))
}

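/**
* Get the number of entries stored in the cache
* @return The number of cache entries
*/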
count (): number {
return this._cache.count()
}

findSubset (bgp: BasicGraphPattern): [BasicGraphPattern, BasicGraphPattern] {
// if the bgp is in the cache, then the computation is simple
if (this.has(bgp)) {
return [bgp, []]
}
// otherwise, we search for all candidate subsets
let matches = []
for (let pattern of bgp) {
const searchResults = this._allKeys
.search(rdf.hashTriple(pattern))
.filter(v => {
// remove all BGPs that are not a subset of the input BGP
// we use lodash.findIndex + rdf.tripleEquals to check for triple pattern equality
return v.bgp.every(a => findIndex(bgp, b => rdf.tripleEquals(a, b)) > -1)
})
matches.push({ pattern, searchResults })
}
// compute the largest subset BGP and the missing patterns (missingPatterns = input_BGP - subset_BGP)
let foundPatterns: BasicGraphPattern = []
let maxBGPLength = -1
for (let match of matches) {
if (match.searchResults.length > 0) {
const localMax = maxBy(match.searchResults, v => v.bgp.length)
if (localMax !== undefined && localMax.bgp.length > maxBGPLength) {
maxBGPLength = localMax.bgp.length
foundPatterns = localMax.bgp
}
}
}
return [foundPatterns, differenceWith(bgp, foundPatterns, rdf.tripleEquals)]
}
}