From 07cd3cff2646bf433fa800fd4074fa559d65ce07 Mon Sep 17 00:00:00 2001 From: Michal Linhard Date: Tue, 1 Oct 2019 13:02:12 +0200 Subject: [PATCH] Replace server with Go version. Change build process. --- README.md | 192 +---- client/.gitignore | 2 + client/bin/exactly | 44 +- client/exactly/exactly.py | 4 +- client/rpm-prep | 2 + client/setup.py | 18 +- client/setup.sh | 29 + devtools/update_client.sh | 30 - devtools/update_server.sh | 23 - installer/.gitignore | 2 + installer/Dockerfile | 12 + installer/README.md | 12 + installer/build.sh | 32 + installer/rpm-builder/.gitignore | 3 - installer/rpm-builder/Dockerfile | 10 - .../rpm-builder/Dockerfile.maven-repoload | 24 - installer/rpm-builder/build-rpm.sh | 35 - installer/rpm-builder/build.sh | 22 - installer/rpm-builder/maven-repoload.sh | 3 - installer/rpm-builder/package.spec | 30 - installer/rpm-tester/.gitignore | 1 - installer/rpm-tester/Dockerfile | 11 - installer/rpm-tester/build.sh | 32 - installer/rpm-tester/test.sh | 3 - installer/test.sh | 32 + server/.gitignore | 4 +- server/README.md | 128 +++ server/esa/esa.go | 477 +++++++++++ server/esa/esa_test.go | 28 + server/go.mod | 10 + server/go.sum | 8 + server/main.go | 51 ++ server/pom.xml | 115 --- server/search/emptyres.go | 64 ++ server/search/helper_test.go | 156 ++++ server/search/hit.go | 56 ++ server/search/multidoc.go | 297 +++++++ server/search/search.go | 258 ++++++ server/search/search_test.go | 241 ++++++ server/server/client_test.go | 168 ++++ server/server/config.go | 87 ++ server/server/docstore.go | 188 +++++ server/server/docstore_test.go | 47 ++ server/server/dto.go | 46 + server/server/filewalk.go | 189 +++++ server/server/filewalk_test.go | 26 + server/server/helper_test.go | 69 ++ server/server/indexer.go | 146 ++++ server/server/listeners.go | 48 ++ server/server/server.go | 149 ++++ server/server/server_test.go | 181 ++++ server/server/tmpdir_test.go | 63 ++ .../java/sk/linhard/exactly/Document.java | 11 - 
.../src/main/java/sk/linhard/exactly/Hit.java | 43 - .../java/sk/linhard/exactly/HitContext.java | 28 - .../main/java/sk/linhard/exactly/Search.java | 24 - .../sk/linhard/exactly/SearchBuilder.java | 148 ---- .../java/sk/linhard/exactly/SearchResult.java | 52 -- .../linhard/exactly/StringSearchBuilder.java | 318 ------- .../sk/linhard/exactly/gui/ContentPanel.java | 85 -- .../java/sk/linhard/exactly/gui/Core.java | 116 --- .../sk/linhard/exactly/gui/EntryPanel.java | 104 --- .../sk/linhard/exactly/gui/EntryTable.java | 130 --- .../sk/linhard/exactly/gui/LoadingPanel.java | 126 --- .../sk/linhard/exactly/gui/MainMenuBar.java | 29 - .../sk/linhard/exactly/gui/MainWindow.java | 121 --- .../linhard/exactly/gui/SearchResultItem.java | 51 -- .../sk/linhard/exactly/impl/DefaultHit.java | 53 -- .../exactly/impl/DefaultHitContext.java | 46 - .../linhard/exactly/impl/DefaultSearch.java | 794 ------------------ .../exactly/impl/EnhancedSuffixArray.java | 140 --- .../sk/linhard/exactly/impl/FileLoader.java | 136 --- .../linhard/exactly/impl/HitLineContext.java | 9 - .../impl/IndexingProgressReporter.java | 152 ---- .../exactly/impl/MultiDocumentSearch.java | 262 ------ .../linhard/exactly/impl/SafeHitContext.java | 47 -- .../java/sk/linhard/exactly/impl/sais.java | 449 ---------- .../linhard/exactly/lucene/LuceneSearch.java | 27 - .../exactly/lucene/LuceneSearchBuilder.java | 53 -- .../sk/linhard/exactly/rest/Application.java | 12 - .../linhard/exactly/rest/DocumentRequest.java | 39 - .../exactly/rest/DocumentResponse.java | 49 -- .../exactly/rest/SearchController.java | 41 - .../linhard/exactly/rest/SearchRequest.java | 58 -- .../linhard/exactly/rest/SearchResponse.java | 85 -- .../sk/linhard/exactly/rest/SearchServer.java | 171 ---- .../exactly/rest/SearchServerConfig.java | 17 - .../exactly/rest/SearchServerStats.java | 62 -- .../sk/linhard/exactly/tika/ParseResult.java | 57 -- .../java/sk/linhard/exactly/tika/Search.java | 45 - .../linhard/exactly/tika/TikaFsCrawler.java | 
96 --- server/src/main/resources/VERSION | 1 - .../src/main/resources/application.properties | 2 - .../sk/linhard/exactly/gui/WindowTests.java | 8 - .../sk/linhard/exactly/impl/BasicTest.java | 35 - .../linhard/exactly/impl/ByteArrayTests.java | 142 ---- .../exactly/impl/SASearchSeparatorTest.java | 99 --- .../exactly/impl/SearchResultChecker.java | 61 -- .../impl/StringSearchResultChecker.java | 55 -- .../sk/linhard/exactly/impl/StringTests.java | 278 ------ .../sk/linhard/exactly/impl/TestUtil.java | 47 -- 101 files changed, 3355 insertions(+), 5567 deletions(-) create mode 100644 client/rpm-prep create mode 100755 client/setup.sh delete mode 100755 devtools/update_client.sh delete mode 100755 devtools/update_server.sh create mode 100644 installer/.gitignore create mode 100644 installer/Dockerfile create mode 100644 installer/README.md create mode 100755 installer/build.sh delete mode 100644 installer/rpm-builder/.gitignore delete mode 100644 installer/rpm-builder/Dockerfile delete mode 100644 installer/rpm-builder/Dockerfile.maven-repoload delete mode 100755 installer/rpm-builder/build-rpm.sh delete mode 100755 installer/rpm-builder/build.sh delete mode 100755 installer/rpm-builder/maven-repoload.sh delete mode 100644 installer/rpm-builder/package.spec delete mode 100644 installer/rpm-tester/.gitignore delete mode 100644 installer/rpm-tester/Dockerfile delete mode 100755 installer/rpm-tester/build.sh delete mode 100755 installer/rpm-tester/test.sh create mode 100755 installer/test.sh create mode 100644 server/README.md create mode 100644 server/esa/esa.go create mode 100644 server/esa/esa_test.go create mode 100644 server/go.mod create mode 100644 server/go.sum create mode 100644 server/main.go delete mode 100644 server/pom.xml create mode 100644 server/search/emptyres.go create mode 100644 server/search/helper_test.go create mode 100644 server/search/hit.go create mode 100644 server/search/multidoc.go create mode 100644 server/search/search.go create mode 
100644 server/search/search_test.go create mode 100644 server/server/client_test.go create mode 100644 server/server/config.go create mode 100644 server/server/docstore.go create mode 100644 server/server/docstore_test.go create mode 100644 server/server/dto.go create mode 100644 server/server/filewalk.go create mode 100644 server/server/filewalk_test.go create mode 100644 server/server/helper_test.go create mode 100644 server/server/indexer.go create mode 100644 server/server/listeners.go create mode 100644 server/server/server.go create mode 100644 server/server/server_test.go create mode 100644 server/server/tmpdir_test.go delete mode 100644 server/src/main/java/sk/linhard/exactly/Document.java delete mode 100644 server/src/main/java/sk/linhard/exactly/Hit.java delete mode 100644 server/src/main/java/sk/linhard/exactly/HitContext.java delete mode 100644 server/src/main/java/sk/linhard/exactly/Search.java delete mode 100644 server/src/main/java/sk/linhard/exactly/SearchBuilder.java delete mode 100644 server/src/main/java/sk/linhard/exactly/SearchResult.java delete mode 100644 server/src/main/java/sk/linhard/exactly/StringSearchBuilder.java delete mode 100644 server/src/main/java/sk/linhard/exactly/gui/ContentPanel.java delete mode 100644 server/src/main/java/sk/linhard/exactly/gui/Core.java delete mode 100644 server/src/main/java/sk/linhard/exactly/gui/EntryPanel.java delete mode 100644 server/src/main/java/sk/linhard/exactly/gui/EntryTable.java delete mode 100644 server/src/main/java/sk/linhard/exactly/gui/LoadingPanel.java delete mode 100644 server/src/main/java/sk/linhard/exactly/gui/MainMenuBar.java delete mode 100644 server/src/main/java/sk/linhard/exactly/gui/MainWindow.java delete mode 100644 server/src/main/java/sk/linhard/exactly/gui/SearchResultItem.java delete mode 100644 server/src/main/java/sk/linhard/exactly/impl/DefaultHit.java delete mode 100644 server/src/main/java/sk/linhard/exactly/impl/DefaultHitContext.java delete mode 100644 
server/src/main/java/sk/linhard/exactly/impl/DefaultSearch.java delete mode 100644 server/src/main/java/sk/linhard/exactly/impl/EnhancedSuffixArray.java delete mode 100644 server/src/main/java/sk/linhard/exactly/impl/FileLoader.java delete mode 100644 server/src/main/java/sk/linhard/exactly/impl/HitLineContext.java delete mode 100644 server/src/main/java/sk/linhard/exactly/impl/IndexingProgressReporter.java delete mode 100644 server/src/main/java/sk/linhard/exactly/impl/MultiDocumentSearch.java delete mode 100644 server/src/main/java/sk/linhard/exactly/impl/SafeHitContext.java delete mode 100644 server/src/main/java/sk/linhard/exactly/impl/sais.java delete mode 100644 server/src/main/java/sk/linhard/exactly/lucene/LuceneSearch.java delete mode 100644 server/src/main/java/sk/linhard/exactly/lucene/LuceneSearchBuilder.java delete mode 100644 server/src/main/java/sk/linhard/exactly/rest/Application.java delete mode 100644 server/src/main/java/sk/linhard/exactly/rest/DocumentRequest.java delete mode 100644 server/src/main/java/sk/linhard/exactly/rest/DocumentResponse.java delete mode 100644 server/src/main/java/sk/linhard/exactly/rest/SearchController.java delete mode 100644 server/src/main/java/sk/linhard/exactly/rest/SearchRequest.java delete mode 100644 server/src/main/java/sk/linhard/exactly/rest/SearchResponse.java delete mode 100644 server/src/main/java/sk/linhard/exactly/rest/SearchServer.java delete mode 100644 server/src/main/java/sk/linhard/exactly/rest/SearchServerConfig.java delete mode 100644 server/src/main/java/sk/linhard/exactly/rest/SearchServerStats.java delete mode 100644 server/src/main/java/sk/linhard/exactly/tika/ParseResult.java delete mode 100644 server/src/main/java/sk/linhard/exactly/tika/Search.java delete mode 100644 server/src/main/java/sk/linhard/exactly/tika/TikaFsCrawler.java delete mode 100644 server/src/main/resources/VERSION delete mode 100644 server/src/main/resources/application.properties delete mode 100644 
server/src/test/java/sk/linhard/exactly/gui/WindowTests.java delete mode 100644 server/src/test/java/sk/linhard/exactly/impl/BasicTest.java delete mode 100644 server/src/test/java/sk/linhard/exactly/impl/ByteArrayTests.java delete mode 100644 server/src/test/java/sk/linhard/exactly/impl/SASearchSeparatorTest.java delete mode 100644 server/src/test/java/sk/linhard/exactly/impl/SearchResultChecker.java delete mode 100644 server/src/test/java/sk/linhard/exactly/impl/StringSearchResultChecker.java delete mode 100644 server/src/test/java/sk/linhard/exactly/impl/StringTests.java delete mode 100644 server/src/test/java/sk/linhard/exactly/impl/TestUtil.java diff --git a/README.md b/README.md index 1f2d8b9..5adfbef 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Intro -**exactly** is an exact substring search tool. It is ablet to find positions of arbitrary substrings, not just the whole words or similar words +**exactly** is an exact substring search tool. It is able to find positions of arbitrary substrings, not just the whole words or similar words as it is the case with the full-text search tools such as [Apache Lucene](http://lucene.apache.org/). **exactly** builds an index on a set of files (binary or text, doesn't matter) and once the index is computed, it can be queried for occurence of a pattern, i.e. @@ -10,212 +10,58 @@ Let *T* be concatenation of the documents, *T = D1 + sep + D2 + sep + ... + Dn* a query for pattern *P* will find all of the tuples *(p, j)* where it holds that *P* is substring of *Dj* starting at position *p*. -The query can be answered pretty fast, in *O(P+Q)* time (*Q* is the number of the returned *(p,j)* tuples) +The query can be answered pretty fast, in *O(len(P)+Q)* time (len(P) is length of *P* and *Q* is the number of the returned *(p,j)* tuples) Since **exactly** can find position of any pattern anywhere in the set of documents, this comes at the cost of memory. 
**exactly** uses Enhanced suffix arrays as described in *Replacing suffix trees with enhanced suffix arrays* by *Abouelhoda, Kurtz, Ohlebusch* [article](https://www.sciencedirect.com/science/article/pii/S1570866703000650) So far this is a basic straightforward implementation without any optimizations. The complete data structure takes *~ 25N* bytes of memory (*N* is length of total text *T*) -in the peak and *21N* afterwards. Currently only total text up to 2 GB is supported due to java array indexing limitation. +in the peak and *21N* afterwards. Currently only total text length up to 2 GB is supported. To compute suffix array we use [SA-IS algorithm implementation by Yuta Mori](https://sites.google.com/site/yuta256/sais). -# Demo +## Installation -[![asciicast](https://asciinema.org/a/Pj6xP9ZP0DRz7OFtBJoLaiYXj.png)](https://asciinema.org/a/Pj6xP9ZP0DRz7OFtBJoLaiYXj) +Currently, the installer is only available for Fedora 64-bit system. -# Building and installation -## Requirements - -So far I've only created installer for Fedora 64-bit. I thought that would automatically give us CentOS/RHEL 7 but there's no python3 -on those so it's Fedora only. I haven't figured out a way to build and securely distribute RPMs yet, -so it needs to be built from sources, which requires git and docker. Everything else will be installed only inside of the docker container. - -## Build - -``` -git clone https://github.com/mlinhard/exactly -pushd exactly/installer/rpm-builder -./build.sh -popd -``` - -After this you should find your installer in exactly/installer/rpm-builder/rpm/x86_64 - -## Install - -Just install the RPM with yum/dnf tool. After this you should be able to use the **exactly** tool from command-line. 
The main dependencies are - -- Java 1.8 JRE - the REST server is currently written as Spring boot REST service -- Python - the exactly command-line console tool is written in Python - -## Non-RPM installation - -I haven't yet produced a convenient installer for other linux distros, but it shouldn't be that hard to make **exactly** running without -the installer. - -### Server -The server is a standard mavenized Java 1.8 project. It needs to be built by standard `mvn clean install` command (in server folder). -This will produce `server/target/exactly-server-.jar` file. This is runnable by - - `java -jar exactly-server-.jar --dir=` - - where `` is the folder to be indexed - -### Client - -You need to be root to perform some of this. Create file `/opt/exactly/lib/python/VERSION` with version string -(it should reflect the current version, e.g. same as in the `exactly-server-.jar` filename). Then the -python client should be installable with `python3 setup.py install` inside of the client folder. After this you could place -client/bin/exactly into your /usr/bin/exactly and you should be fine. - - -# Usage - -## Command line +## Usage `exactly index ` -Will start the REST server at http://localhost:9201 and index for all of the files (recursively) under given `` directory. +Start the indexing server on given root folder. `exactly search` Will start exactly console client where you'll be able to enter search queries. 
+## Build -## API - -**WARNING: The API is not yet fixed and is subject to change even with bugfix releases.** - -### Server statistics -*GET http://localhost:9201/version* - -will return simple one-line version string (no JSON) - -*GET http://localhost:9201/stats* - -will return server stats, that look like this: - -```json -{ - "indexed_bytes": 110505, - "indexed_files": 39, - "done_crawling": true, - "done_loading": true, - "done_indexing": true -} -``` - -### Search query - -*POST http://localhost:9201/search* (Content-Type: application/json) - -With request data: -```json -{ - "pattern": "cGF0dGVybg==", - "max_hits": 3, - "max_context": 20, - "offset": 0 -} -``` - -Will output something like: -```json -{ - "hits": [ - { - "pos": 286, - "doc_id": "/home/mlinhard/dev/projects/exactly/workspace/exactly/server/src/main/java/sk/linhard/search/Search.java", - "ctx_before": "eHQuCgkgKiAKCSAqIEBwYXJhbSA=", - "ctx_after": "CgkgKiBAcmV0dXJuCgkgKi8KCVM=" - }, - { - "pos": 521, - "doc_id": "/home/mlinhard/dev/projects/exactly/workspace/exactly/server/src/main/java/sk/linhard/search/HitContext.java", - "ctx_before": "IHN0cmluZyArIGxlbmd0aCBvZiA=", - "ctx_after": "CgkgKi8KCWludCBoaWdobGlnaHQ=" - }, - { - "pos": 189, - "doc_id": "/home/mlinhard/dev/projects/exactly/workspace/exactly/server/src/main/java/sk/linhard/search/HitContext.java", - "ctx_before": "IHN0cmluZyBiZWZvcmUgKwogKiA=", - "ctx_after": "ICsgYWZ0ZXIgd2l0aCBoaWdobGk=" - } - ] -} -``` - -#### Request params: - -- **pattern** - Base64 encoded binary string to search for -- **max_hits** - Maximum number of hits to return. Since the pattern can be even a single letter, the search query result size can be potentially quite huge, thus we need to limit the number of hits. -- **max_context** - Max number of bytes before and after the pattern that will be included in each hit to give the context to the position of the found pattern. 
-- **offset** - Optional parameter, if there are more pattern hits than max_hits, return segment starting at offset in complete hit list - -#### Response format: - -- **hits** - JSON array of hit JSON objects -- **cursor** - JSON Object representing the hit array cursor. If this object is not present this means -that the returned **hits** array is complete. If present this means that the array is only a portion of bigger array -that wasn't returned complete due to **max_hits** limitation - -#### Hit format: - -- **pos** - position of the hit in the document -- **doc_id** - string ID of the document, currently this is a file name -- **ctx_before** - Base64 encoded context *before* the pattern occurence -- **ctx_after** - Base64 encoded context *after* the pattern occurence - -#### Cursor format: +The following snippets assume you checked out this git repository -- **complete_size** - size of the complete search result (number of hits) -- **offset** - offset of this result's segment in the complete array +### Installer -Usually what you want to do if you receive incomplete response (with cursor element present) is to POST /search -again with offset increased by max_hits. +See [Exactly installers](installer) section -### Document retrieval +### Server -*GET http://localhost:9201/document/{document_idx}* +See [Exactly indexing server](server) section -Will retrieve document by its index (order in which it was indexed) +### Client -*POST http://localhost:9201/document* (Content-Type: application/json) +Go into the `client` directory and run `setup.sh` script. This will locally build `exactly-index` golang binary and then include it in virtualenv folder `.venv`. You can then use exactly in a familiar fashion: -With request data: -```json -{ - "document_id": "/home/mlinhard/Documents/textfile1.txt", - "document_index": 3 -} ``` -Can be used to retrieve the documents both by index and their string ID (usually path). 
- -#### Response format: - -Example -```json -{ - "document_id": "/home/mlinhard/Documents/textfile1.txt", - "document_index": 3, - "content": "cGF0dGVybg==" -} +source .venv/bin/activate +exactly ``` -- **document_id** - Document string ID, usually path -- **document_index** - Document index (order in which it was indexed) -- **content** - Base64 encoded binary document content - # TODO - [ ] Improve console - display document paths and more context and more hits on demand - [ ] Improve console - display server connection / indexing status in status-bar -- [ ] Add JVM stats (mainly memory usage) - [ ] Enhanced suffix array memory optimization - [ ] Hexadecimal console mode (allow search for binary strings) - [ ] Add memory performance test (comparison with Lucene) -- [ ] Change the included Swing GUI code to REST client mode -- [ ] Allow UTF-8 String searches in GUI mode +- [ ] Allow UTF-8 String searches and UTF-8 context display - [ ] Better test coverage - + diff --git a/client/.gitignore b/client/.gitignore index 813c02e..d88aa00 100644 --- a/client/.gitignore +++ b/client/.gitignore @@ -3,3 +3,5 @@ build dist/ exactly.egg-info test.py +.venv +.vscode diff --git a/client/bin/exactly b/client/bin/exactly index b04ebfb..0a237d9 100755 --- a/client/bin/exactly +++ b/client/bin/exactly @@ -13,36 +13,40 @@ Options: --debug Start PyDev debug server. 
Debug string format: host:port:pydev_src """ from docopt import docopt +import pkg_resources import subprocess import sys -from exactly import console -from exactly import set_debug, set_debug_logging +import json +from exactly import console, set_debug, set_debug_logging +from tempfile import NamedTemporaryFile + def get_version(): - try: - with open("/opt/exactly/lib/python/VERSION", "r") as f: - return f.read() - except: - return "UNKNOWN" + return pkg_resources.require("exactly")[0].version def index(root_folder, debug): - cmds = ['java', '-jar' ] - if debug: - cmds.append('-Dlogging.level.sk.linhard=DEBUG') - cmds.append('/opt/exactly/lib/java/exactly-server.jar') - cmds.append('--dir=' + root_folder) - p = subprocess.Popen(cmds) - try: - return p.wait() - except KeyboardInterrupt: - pass - - + with NamedTemporaryFile(mode="w", prefix="exactly-index-", suffix="-config.json") as tmpfile: + config = { + "listen_address": "localhost:9201", + "num_file_loaders": 4, + "num_file_staters": 4, + "roots": [root_folder], + "ignored_directories": [] + } + json.dump(config, tmpfile) + tmpfile.flush() + p = subprocess.Popen(['exactly-index', '-config=' + tmpfile.name]) + try: + return p.wait() + except KeyboardInterrupt: + pass + + def search(): return console.ExactlyConsole.main() - + if __name__ == '__main__': args = docopt(__doc__, version=get_version()) set_debug_logging(args['--debug-log']) diff --git a/client/exactly/exactly.py b/client/exactly/exactly.py index ec82ed4..163b0ed 100644 --- a/client/exactly/exactly.py +++ b/client/exactly/exactly.py @@ -132,7 +132,7 @@ def get(self, relpath): def post(self, relpath, json_req): try: - return requests.post(self.uri + relpath, json=json.dumps(json_req)) + return requests.post(self.uri + relpath, json=json_req) except ConnectionError: raise Exception("Can't connect to index at " + self.uri + ". 
Make sure that the index is running") except Exception as e: @@ -157,7 +157,7 @@ def search(self, query): if r.status_code == 200: return SearchResult.from_json(r.json()) else: - return None + raise Exception(f"Unexpected code: {r.status_code}: {r.text}") def stats(self): r = self.get("/stats") diff --git a/client/rpm-prep b/client/rpm-prep new file mode 100644 index 0000000..518209a --- /dev/null +++ b/client/rpm-prep @@ -0,0 +1,2 @@ +%setup -n %{name}-%{unmangled_version} -n %{name}-%{unmangled_version} +%global debug_package %{nil} diff --git a/client/setup.py b/client/setup.py index 5a39cf9..95cbbb9 100644 --- a/client/setup.py +++ b/client/setup.py @@ -1,26 +1,16 @@ from setuptools import setup - - -def get_version(): - try: - with open("/opt/exactly/lib/python/VERSION", "r") as f: - return f.read() - except Exception as e: - print("You need to supply VERSION file") - raise e - - -unified_version = get_version() -print("Using version: " + unified_version) +from os import getenv setup(name='exactly', - version=unified_version, + version=getenv('EXACTLY_VERSION'), description='Binary exact search', url='http://github.com/mlinhard/exactly', author='Michal Linhard', author_email='michal@linhard.sk', license='Apache 2.0', packages=['exactly'], + scripts=['bin/exactly'], + data_files=[('bin', ['bin/exactly-index'])], zip_safe=False, install_requires=[ 'docopt', 'requests' diff --git a/client/setup.sh b/client/setup.sh new file mode 100755 index 0000000..a7ff029 --- /dev/null +++ b/client/setup.sh @@ -0,0 +1,29 @@ +#!/bin/bash +if [ "$1" != "update" ]; then + rm -rf .venv + python3 -m virtualenv .venv +fi + +source .venv/bin/activate + +export EXACTLY_VERSION=`git describe --tags` + +if [ ! -f bin/exactly-index ]; then + if [ ! 
-f ../server/exactly-index ]; then + pushd ../server + go build -o exactly-index "-ldflags=-s -w -X main.Version=${EXACTLY_VERSION}" + popd + fi + mv ../server/exactly-index bin +fi + +python3 setup.py install + +# for some reaason the setup in virtualenv doesn't copy bin/exactly-index to .venv/bin +find .venv -name exactly-index -exec rm {} \; +mv bin/exactly-index .venv/bin + +rm -rf build dist exactly.egg-info + +deactivate + diff --git a/devtools/update_client.sh b/devtools/update_client.sh deleted file mode 100755 index f2e7b1e..0000000 --- a/devtools/update_client.sh +++ /dev/null @@ -1,30 +0,0 @@ -# This is a script used during development -# It replaces files in current installation of exactly -# /usr/bin/exactly -# /opt/exactly/lib/python -# /usr/local/lib/python3.6/site-packages/exactly-0.1.0_SNAPSHOT-py3.6.egg (using setup.py) - -check_rpm_installed=`rpm -q exactly &> /dev/null && echo success || echo fail` - -if [ ! "${check_rpm_installed}" == "success" ]; then - echo "Exactly must be installed in your system to update it" - exit 1 -fi - -if [ ! -f devtools/update_client.sh ]; then - echo "Must be run from codebase root" - exit 1 -fi - -if [ ! -f server/target/classes/VERSION ]; then - echo "Server must be built to produce VERSION file" - exit 1 -fi - -sudo cp server/target/classes/VERSION /opt/exactly/lib/python/VERSION -sudo cp client/bin/exactly /usr/bin/exactly - -cd client -sudo python3 setup.py install - -sudo rm -rf build dist exactly.egg-info \ No newline at end of file diff --git a/devtools/update_server.sh b/devtools/update_server.sh deleted file mode 100755 index 3da6102..0000000 --- a/devtools/update_server.sh +++ /dev/null @@ -1,23 +0,0 @@ -# This is a script used during development -# It replaces files in current installation of exactly -# /opt/exactly/lib/java/exactly-server.jar - -check_rpm_installed=`rpm -q exactly &> /dev/null && echo success || echo fail` - -if [ ! 
"${check_rpm_installed}" == "success" ]; then - print "Exactly must be installed in your system to update it" - exit 1 -fi - -if [ ! -f devtools/update_server.sh ]; then - print "Must be run from codebase root" - exit 1 -fi - -if [ ! -f server/target/classes/VERSION ]; then - print "Server must be built to produce VERSION file" - exit 1 -fi - -version=`cat server/target/classes/VERSION` -sudo cp server/target/exactly-server-${version}.jar /opt/exactly/lib/java/exactly-server.jar diff --git a/installer/.gitignore b/installer/.gitignore new file mode 100644 index 0000000..f2b578f --- /dev/null +++ b/installer/.gitignore @@ -0,0 +1,2 @@ +tmp +*.rpm diff --git a/installer/Dockerfile b/installer/Dockerfile new file mode 100644 index 0000000..681d7b3 --- /dev/null +++ b/installer/Dockerfile @@ -0,0 +1,12 @@ +FROM fedora:30 + +RUN yum -y update && yum clean all + +RUN mkdir -p /go && chmod -R 777 /go && \ + yum -y install \ + git \ + golang \ + rpm-build \ + python3 \ + python3-setuptools \ + && yum clean all diff --git a/installer/README.md b/installer/README.md new file mode 100644 index 0000000..c1fb104 --- /dev/null +++ b/installer/README.md @@ -0,0 +1,12 @@ +# Exactly installers + +Currently only **Fedora 30 64-bit RPM** installer build is supported. + +## Installer build + +Run the `build.sh` command. This requires docker installed on your machine. The build process should run entirely in a [fedora:30](https://hub.docker.com/_/fedora) derived docker container +and after it ends an RPM file should appear in the same directory. + +## Installer test + +After you've built the installer, you can check it with the `test.sh` command. 
It will install the RPM in a fresh [fedora:30](https://hub.docker.com/_/fedora) derived docker container and check if it works properly diff --git a/installer/build.sh b/installer/build.sh new file mode 100755 index 0000000..d33a7b8 --- /dev/null +++ b/installer/build.sh @@ -0,0 +1,32 @@ +#!/bin/sh + +if [ $# == 0 ]; then + # on host machine + rm -rf tmp + mkdir tmp + cp -r ../server tmp + cp -r ../client tmp + cp ./build.sh tmp + docker build . -t exactly-server-builder + docker run -i -t -w /build -v "$PWD/tmp:/build:rw,z" exactly-server-builder /build/build.sh `id -u` `id -g` `git describe --tags` + mv tmp/*.rpm . + rm -rf tmp +else + # inside docker + host_uid=$1 + host_gid=$2 + version=$3 + pushd server + go build -o exactly-index "-ldflags=-s -w -X main.Version=${version}" + popd + mv server/exactly-index client/bin + export EXACTLY_VERSION=${version} + pushd client + python3 setup.py bdist_rpm --binary-only --force-arch x86_64 --prep-script rpm-prep + popd + mv client/dist/*.rpm . 
+ chown -R $host_uid:$host_gid * +fi + + + diff --git a/installer/rpm-builder/.gitignore b/installer/rpm-builder/.gitignore deleted file mode 100644 index 7865745..0000000 --- a/installer/rpm-builder/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -sources -cache -rpm diff --git a/installer/rpm-builder/Dockerfile b/installer/rpm-builder/Dockerfile deleted file mode 100644 index b3e6b79..0000000 --- a/installer/rpm-builder/Dockerfile +++ /dev/null @@ -1,10 +0,0 @@ -FROM exactly-rpm-builder-maven-repoload - -WORKDIR /home/installer - -# refresh sources -RUN rm -rf maven-repoload-server maven-repoload.sh -COPY sources/client /home/installer/client -COPY sources/server /home/installer/server -COPY build-rpm.sh /home/installer/build-rpm.sh -COPY package.spec /home/installer/package.spec diff --git a/installer/rpm-builder/Dockerfile.maven-repoload b/installer/rpm-builder/Dockerfile.maven-repoload deleted file mode 100644 index 6af90b2..0000000 --- a/installer/rpm-builder/Dockerfile.maven-repoload +++ /dev/null @@ -1,24 +0,0 @@ -FROM centos:centos7 - -WORKDIR /home/installer - -RUN yum install -y \ - rpm-build \ - wget \ - java-1.8.0-devel \ - which - -# Download Maven -RUN wget http://tux.rainside.sk/apache/maven/maven-3/3.5.4/binaries/apache-maven-3.5.4-bin.tar.gz && \ - tar xf apache-maven-3.5.4-bin.tar.gz && \ - rm -f apache-maven-3.5.4-bin.tar.gz - -ENV MAVEN_HOME=/home/installer/apache-maven-3.5.4 -ENV PATH=${MAVEN_HOME}/bin:${PATH} - -COPY cache/maven-repoload/server /home/installer/maven-repoload/server -COPY maven-repoload.sh /home/installer/maven-repoload.sh - -# This will download all the maven dependencies into /root/.m2/repository -# We don't want to do this too often -RUN ./maven-repoload.sh diff --git a/installer/rpm-builder/build-rpm.sh b/installer/rpm-builder/build-rpm.sh deleted file mode 100755 index b0ad10d..0000000 --- a/installer/rpm-builder/build-rpm.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -cd /home/installer/server -mvn clean install - -cd 
/home/installer -mkdir -p BUILD/usr/bin -mkdir -p BUILD/opt/exactly/lib/java -mkdir -p BUILD/opt/exactly/lib/python - -jar xf server/target/exactly-server-*.jar BOOT-INF/classes/VERSION -mv BOOT-INF/classes/VERSION client/VERSION -rm -rf BOOT-INF - -cd /home/installer/client - -version=`cat VERSION` -rpm_version=`cat VERSION | sed 's/\(.*\)-SNAPSHOT/\1/g'` -rpm_release=1 - -cd /home/installer -mv server/target/exactly-server-$version.jar BUILD/opt/exactly/lib/java/exactly-server.jar -mv client/bin/exactly BUILD/usr/bin -rm -rf client/bin -cp -r client/* BUILD/opt/exactly/lib/python - -rpmbuild -bb --buildroot /home/installer/BUILD \ - --define "version $rpm_version" \ - --define "release $rpm_release" \ - --define "_rpmdir /home/installer/rpm" \ - package.spec - -host_uid=$1 -host_gid=$2 - -chown -R $host_uid:$host_gid rpm diff --git a/installer/rpm-builder/build.sh b/installer/rpm-builder/build.sh deleted file mode 100755 index b3e00e1..0000000 --- a/installer/rpm-builder/build.sh +++ /dev/null @@ -1,22 +0,0 @@ -if [ ! `basename $PWD` == rpm-builder -o ! -f Dockerfile ]; then - echo "You must run this in installer/rpm-builder folder" - exit 1 -fi - -rm -rf sources -mkdir sources -cp -r ../../client sources/client -cp -r ../../server sources/server -rm -rf sources/client/*.pyc - -if [ ! -d cache/maven-repoload ]; then - mkdir -p cache/maven-repoload - cp -r ../../server cache/maven-repoload -fi - -docker build -t exactly-rpm-builder-maven-repoload -f Dockerfile.maven-repoload . -docker build -t exactly-rpm-builder . 
-rm -rf rpm -mkdir rpm -docker run -i -t -v "$PWD/rpm:/home/installer/rpm:rw,z" exactly-rpm-builder ./build-rpm.sh `id -u` `id -g` - diff --git a/installer/rpm-builder/maven-repoload.sh b/installer/rpm-builder/maven-repoload.sh deleted file mode 100755 index 61b8b81..0000000 --- a/installer/rpm-builder/maven-repoload.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -cd /home/installer/maven-repoload/server -mvn clean install diff --git a/installer/rpm-builder/package.spec b/installer/rpm-builder/package.spec deleted file mode 100644 index 9d3ffb0..0000000 --- a/installer/rpm-builder/package.spec +++ /dev/null @@ -1,30 +0,0 @@ -Name: exactly -Version: %{version} -Release: %{release} -Summary: Binary exact search -License: Apache License 2.0 -Group: Applications/Productivity -Packager: Michal Linhard -Requires: java-1.8.0-headless python3 python3-setuptools - -%%pre -#noop - -%%post -cd /opt/exactly/lib/python -cp VERSION /tmp/exactly_VERSION -python3 setup.py install - -%%preun -#noop - -%%postun -pip3 uninstall exactly - -%%description -Tool for exact substring search in multiple text or binary files. 
- -%%files -/usr/bin/exactly -/opt/exactly/lib/java/exactly-server.jar -/opt/exactly/lib/python/* \ No newline at end of file diff --git a/installer/rpm-tester/.gitignore b/installer/rpm-tester/.gitignore deleted file mode 100644 index 916f55d..0000000 --- a/installer/rpm-tester/.gitignore +++ /dev/null @@ -1 +0,0 @@ -rpm diff --git a/installer/rpm-tester/Dockerfile b/installer/rpm-tester/Dockerfile deleted file mode 100644 index 715d6c9..0000000 --- a/installer/rpm-tester/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -FROM centos:centos7 - -ARG rpm_name - -WORKDIR /home/installer - -COPY ${rpm_name} ${rpm_name} - -RUN yum install -y ${rpm_name} - -COPY test.sh test.sh diff --git a/installer/rpm-tester/build.sh b/installer/rpm-tester/build.sh deleted file mode 100755 index 74e715f..0000000 --- a/installer/rpm-tester/build.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -if [ ! `basename $PWD` == rpm-tester -o ! -f Dockerfile ]; then - echo "You must run this in installer/rpm-tester folder" - exit 1 -fi - -arch=x86_64 - -rm -rf rpm -cp -r ../rpm-builder/rpm . - -pushd rpm/${arch} - -if [ `ls -1 | wc -l` != 1 ]; then - echo "There are multiple RPMs, please clean the old RPMs before proceeding" - exit 1 -fi - -rpm_version=`rpm -qp --queryformat="%{version}" exactly-*.${arch}.rpm` -rpm_release=`rpm -qp --queryformat="%{release}" exactly-*.${arch}.rpm` - -popd - -rpm_name=exactly-${rpm_version}-${rpm_release}.x86_64.rpm - -cp ../rpm-builder/rpm/x86_64/$rpm_name . - -docker build --build-arg "rpm_name=${rpm_name}" -t exactly-rpm-tester . 
- -docker run -i -t exactly-rpm-tester ./test.sh - diff --git a/installer/rpm-tester/test.sh b/installer/rpm-tester/test.sh deleted file mode 100755 index 9039fcb..0000000 --- a/installer/rpm-tester/test.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -exactly diff --git a/installer/test.sh b/installer/test.sh new file mode 100755 index 0000000..3504dd7 --- /dev/null +++ b/installer/test.sh @@ -0,0 +1,32 @@ +#!/bin/sh + +if [ $# -eq 0 ]; then + # on host machine (POSIX numeric test: '==' is a bashism and breaks when /bin/sh is dash) + rm -rf tmp + mkdir tmp + cp *.rpm tmp + cp test.sh tmp + docker run -i -t -w /build -v "$PWD/tmp:/build:rw,z" fedora:30 /build/test.sh `id -u` `id -g` `git describe --tags` + rm -rf tmp +else + # inside docker + host_uid=$1 + host_gid=$2 + version=$3 + dnf install -y *.rpm + mkdir -p /root/.config/exactly-index + echo '{ + "roots" : ["/root/.config/exactly-index"], + "listen_address": "localhost:9201" +}' > /root/.config/exactly-index/server-config.json + exactly-index & + sleep 0.2 + if [ "`curl http://localhost:9201/version`" = "$version" ]; then + echo "TEST PASSED" + else + echo "TEST FAILED" + fi +fi + + + diff --git a/server/.gitignore b/server/.gitignore index 616bbad..3f99dde 100644 --- a/server/.gitignore +++ b/server/.gitignore @@ -1,4 +1,4 @@ -target .project .settings -.classpath +search/local-data +exactly-index \ No newline at end of file diff --git a/server/README.md b/server/README.md new file mode 100644 index 0000000..5256724 --- /dev/null +++ b/server/README.md @@ -0,0 +1,128 @@ +# Exactly indexing server + +This is the indexing server for [Exactly](https://github.com/mlinhard/exactly) written in *Go* language. It uses [sais-go](https://github.com/mlinhard/sais-go) which is Go language wrapper for Yuta Mori's [SAIS implementation](https://sites.google.com/site/yuta256/sais) for fast suffix array construction.
+ +## API + +**WARNING: The API is not yet fixed and is subject to change even with bugfix releases.** + +### Server statistics +*GET http://localhost:9201/version* + +will return a simple one-line version string (plain text, not JSON) + +*GET http://localhost:9201/stats* + +will return server stats that look like this: + +```json +{ + "indexed_bytes": 110505, + "indexed_files": 39, + "done_crawling": true, + "done_loading": true, + "done_indexing": true +} +``` + +### Search query + +*POST http://localhost:9201/search* (Content-Type: application/json) + +With request data: +```json +{ + "pattern": "cGF0dGVybg==", + "max_hits": 3, + "max_context": 20, + "offset": 0 +} +``` + +Will output something like: +```json +{ + "hits": [ + { + "pos": 286, + "doc_id": "/home/mlinhard/dev/projects/exactly/workspace/exactly/server/src/main/java/sk/linhard/search/Search.java", + "ctx_before": "eHQuCgkgKiAKCSAqIEBwYXJhbSA=", + "ctx_after": "CgkgKiBAcmV0dXJuCgkgKi8KCVM=" + }, + { + "pos": 521, + "doc_id": "/home/mlinhard/dev/projects/exactly/workspace/exactly/server/src/main/java/sk/linhard/search/HitContext.java", + "ctx_before": "IHN0cmluZyArIGxlbmd0aCBvZiA=", + "ctx_after": "CgkgKi8KCWludCBoaWdobGlnaHQ=" + }, + { + "pos": 189, + "doc_id": "/home/mlinhard/dev/projects/exactly/workspace/exactly/server/src/main/java/sk/linhard/search/HitContext.java", + "ctx_before": "IHN0cmluZyBiZWZvcmUgKwogKiA=", + "ctx_after": "ICsgYWZ0ZXIgd2l0aCBoaWdobGk=" + } + ] +} +``` + +#### Request params: + +- **pattern** - Base64 encoded binary string to search for +- **max_hits** - Maximum number of hits to return. Since the pattern can be even a single letter, the search query result size can be potentially quite huge, thus we need to limit the number of hits. +- **max_context** - Max number of bytes before and after the pattern that will be included in each hit to give the context to the position of the found pattern.
+- **offset** - Optional parameter. If there are more pattern hits than max_hits, return the segment starting at offset in the complete hit list + +#### Response format: + +- **hits** - JSON array of hit JSON objects +- **cursor** - JSON Object representing the hit array cursor. If this object is not present, this means +that the returned **hits** array is complete. If present, this means that the array is only a portion of a bigger array +that wasn't returned in full due to the **max_hits** limitation + +#### Hit format: + +- **pos** - position of the hit in the document +- **doc_id** - string ID of the document, currently this is a file name +- **ctx_before** - Base64 encoded context *before* the pattern occurrence +- **ctx_after** - Base64 encoded context *after* the pattern occurrence + +#### Cursor format: + +- **complete_size** - size of the complete search result (number of hits) +- **offset** - offset of this result's segment in the complete array + +Usually what you want to do if you receive an incomplete response (with the cursor element present) is to POST /search +again with offset increased by max_hits. + +### Document retrieval + +*GET http://localhost:9201/document/{document_idx}* + +Will retrieve a document by its index (order in which it was indexed) + +*POST http://localhost:9201/document* (Content-Type: application/json) + +With request data: +```json +{ + "document_id": "/home/mlinhard/Documents/textfile1.txt", + "document_index": 3 +} +``` +Can be used to retrieve the documents both by index and their string ID (usually path).
+ +#### Response format: + +Example +```json +{ + "document_id": "/home/mlinhard/Documents/textfile1.txt", + "document_index": 3, + "content": "cGF0dGVybg==" +} +``` + +- **document_id** - Document string ID, usually path +- **document_index** - Document index (order in which it was indexed) +- **content** - Base64 encoded binary document content + diff --git a/server/esa/esa.go b/server/esa/esa.go new file mode 100644 index 0000000..a8d431a --- /dev/null +++ b/server/esa/esa.go @@ -0,0 +1,477 @@ +// Enhanced Suffix Array routines +package esa + +import ( + "bytes" + "fmt" + "log" + "sort" + + "github.com/golang-collections/collections/stack" + "github.com/mlinhard/sais-go/sais" +) + +const ( + UNDEF = int32(-1) + CUNDEF = int16(-1) +) + +type int32stack []int32 + +func (s int32stack) Peek() int32 { + return s[len(s)-1] +} + +func (s int32stack) Push(v int32) int32stack { + return append(s, v) +} + +func (s int32stack) Pop() (int32stack, int32) { + l := len(s) + return s[:l-1], s[l-1] +} + +type Interval struct { + Length int32 + Start int32 + End int32 +} + +type EnhancedSuffixArray struct { + Data []byte + SA []int32 + LCP []int32 + Rank []int32 + Up []int32 + Down []int32 + Next []int32 + rootInterval Interval +} + +func (this *Interval) String() string { + return fmt.Sprintf("%v-[%v, %v]", this.Length, this.Start, this.End) +} + +func New(data []byte) (*EnhancedSuffixArray, error) { + esa := newESA(data) + err := esa.computeSA() + if err != nil { + return nil, err + } + esa.computeLCPKeepRank(false) + esa.computeUpDown() + esa.computeNext() + esa.rootInterval = Interval{0, 0, int32(len(esa.SA) - 1)} + return esa, nil +} + +func (esa *EnhancedSuffixArray) findNonExistentChar(parent *Interval, sepLen int32, occurence []bool) int16 { + for i := range occurence { + occurence[i] = false + } + esa.forEachChild(parent, func(child *Interval) { + if esa.SA[child.Start]+sepLen < int32(len(esa.Data)) { + edgeStart := esa.Data[esa.SA[child.Start]+sepLen] + 
occurence[edgeStart] = true + } + }) + for i := int16(0); i < int16(len(occurence)); i++ { + if !occurence[i] { + return i + } + } + return CUNDEF +} + +func (esa *EnhancedSuffixArray) buildSeparator(saIdx int32, sepLen int32, tail byte) []byte { + separator := make([]byte, sepLen+1) + dataStart := esa.SA[saIdx] + for i := int32(0); i < sepLen; i++ { + separator[i] = esa.Data[dataStart+i] + } + separator[sepLen] = tail + return separator +} + +type sepLenInterval struct { + sepLen int32 + Interval +} + +func (esa *EnhancedSuffixArray) findSeparator() []byte { + var intervalStack stack.Stack + occurenceBuf := make([]bool, 256) + intervalStack.Push(sepLenInterval{0, esa.rootInterval}) + + for intervalStack.Len() != 0 { + t := intervalStack.Pop().(sepLenInterval) + nonExistentChar := esa.findNonExistentChar(&t.Interval, t.sepLen, occurenceBuf) + if nonExistentChar != CUNDEF { + return esa.buildSeparator(t.Interval.Start, t.sepLen, byte(nonExistentChar)) + } else { + esa.forEachChild(&t.Interval, func(child *Interval) { + intervalStack.Push(sepLenInterval{t.sepLen + 1, *child}) + }) + } + } + + return nil +} + +func NewMulti(combinedContent []byte, offsets []int32) (*EnhancedSuffixArray, []byte, error) { + separatorEsa := newESA(combinedContent) + err := separatorEsa.computeSA() + if err != nil { + return nil, nil, err + } + separatorEsa.computeLCPKeepRank(true) + separatorEsa.computeUpDown() + separatorEsa.computeNext() + separatorEsa.rootInterval = Interval{0, 0, int32(len(separatorEsa.SA) - 1)} + separator := separatorEsa.findSeparator() + separatorEsa.introduceSeparators(offsets, separator) + esa, err := New(separatorEsa.Data) + if err != nil { + return nil, nil, err + } + return esa, separator, nil +} + +func newESA(data []byte) *EnhancedSuffixArray { + esa := new(EnhancedSuffixArray) + esa.Data = data + return esa +} + +func (esa *EnhancedSuffixArray) computeSA() error { + n := len(esa.Data) + esa.SA = make([]int32, n+1) + esa.SA[n] = UNDEF + return 
sais.Sais32(esa.Data, esa.SA[:n]) +} + +func (esa *EnhancedSuffixArray) Print() string { + s := " i: SA[i] lcp[i] up[i] down[i] next[i] suffix[SA[i]]\n" + for i := range esa.SA { + suffixStart := esa.SA[i] + suffix := "$" + if suffixStart != UNDEF { + suffix = string(esa.Data[suffixStart:]) + } + s += fmt.Sprintf("%2v: %4v %6v %5v %7v %7v %v\n", i, suffixStart, esa.LCP[i], esa.Up[i], esa.Down[i], esa.Next[i], suffix) + } + return s +} + +func (esa *EnhancedSuffixArray) computeLCPKeepRank(keepRank bool) { + start := (int32)(0) + length := (int32)(len(esa.Data)) + esa.Rank = make([]int32, length) + for i := (int32)(0); i < length; i++ { + esa.Rank[esa.SA[i]] = i + } + h := (int32)(0) + esa.LCP = make([]int32, length+1) + for i := (int32)(0); i < length; i++ { + k := esa.Rank[i] + if k == 0 { + esa.LCP[k] = -1 + } else { + j := esa.SA[k-1] + for i+h < length && j+h < length && esa.Data[start+i+h] == esa.Data[start+j+h] { + h++ + } + esa.LCP[k] = h + } + if h > 0 { + h-- + } + } + esa.LCP[0] = 0 + esa.LCP[length] = 0 + if !keepRank { + esa.Rank = nil + } +} + +func (esa *EnhancedSuffixArray) computeUpDown() { + esa.Up = make([]int32, len(esa.LCP)) + esa.Down = make([]int32, len(esa.LCP)) + for i := range esa.Up { + esa.Up[i] = UNDEF + esa.Down[i] = UNDEF + } + lastIndex := UNDEF + var stack int32stack + stack = stack.Push(0) + for i := (int32)(1); i < (int32)(len(esa.LCP)); i++ { + for esa.LCP[i] < esa.LCP[stack.Peek()] { + stack, lastIndex = stack.Pop() + if esa.LCP[i] <= esa.LCP[stack.Peek()] && esa.LCP[stack.Peek()] != esa.LCP[lastIndex] { + esa.Down[stack.Peek()] = lastIndex + } + } + if lastIndex != UNDEF { + esa.Up[i] = lastIndex + lastIndex = UNDEF + } + stack = stack.Push(i) + } +} + +func (esa *EnhancedSuffixArray) computeNext() { + esa.Next = make([]int32, len(esa.LCP)) + for i := range esa.Up { + esa.Next[i] = UNDEF + } + var stack int32stack + var lastIndex int32 + stack = stack.Push(0) + for i := (int32)(0); i < (int32)(len(esa.LCP)); i++ { + for 
esa.LCP[i] < esa.LCP[stack.Peek()] { + stack, _ = stack.Pop() + } + if esa.LCP[i] == esa.LCP[stack.Peek()] { + stack, lastIndex = stack.Pop() + esa.Next[lastIndex] = i + } + stack = stack.Push(i) + } +} + +func (esa *EnhancedSuffixArray) introduceSeparators(offsets []int32, separator []byte) { + separatorExtraSpace := (int32)((len(offsets) - 1) * len(separator)) + newData := make([]byte, (int32)(len(esa.Data))+separatorExtraSpace) + lastIdx := (int32)(len(offsets) - 1) + for i := (int32)(0); i < lastIdx; i++ { + oldOffset := offsets[i] + separatorExtraSpace = i * (int32)(len(separator)) + esa.MoveSegment(oldOffset, offsets[i+1], separatorExtraSpace, newData) + offsets[i] = oldOffset + separatorExtraSpace + } + oldOffset := offsets[lastIdx] + separatorExtraSpace = lastIdx * (int32)(len(separator)) + esa.MoveSegment(oldOffset, (int32)(len(esa.Data)), separatorExtraSpace, newData) + offsets[lastIdx] = oldOffset + separatorExtraSpace + + for i := (int32)(0); i < (int32)(len(separator)); i++ { + sepChar := separator[i] + for j := (int32)(1); j < (int32)(len(offsets)); j++ { + newData[offsets[j]-(int32)(len(separator))+i] = sepChar + } + } + + esa.Data = newData +} + +func (esa *EnhancedSuffixArray) MoveSegment(start, end, separatorExtraSpace int32, newData []byte) { + for i := start; i < end; i++ { + newData[i+separatorExtraSpace] = esa.Data[i] + } + for j := start; j < end; j++ { + esa.SA[esa.Rank[j]] += separatorExtraSpace + } +} + +func (esa *EnhancedSuffixArray) interval(i, j int32) *Interval { + cup := esa.Up[j] + if cup < j && i < cup { + return &Interval{esa.LCP[cup], i, j} + } + return &Interval{esa.LCP[esa.Down[i]], i, j} +} + +func (esa *EnhancedSuffixArray) createInterval(parent *Interval, childStart, childEnd int32) *Interval { + if childEnd == UNDEF { + childEnd = parent.End + } + if childStart+1 < childEnd { + return esa.interval(childStart, childEnd) + } else if childStart != childEnd { + return &Interval{parent.Length, childStart, childEnd} + } else { + 
return nil + } +} + +func (esa *EnhancedSuffixArray) firstIndex(parent *Interval) int32 { + if *parent == esa.rootInterval { + return 0 + } + cup := esa.Up[parent.End] + if cup < parent.End && parent.Start < cup { + return cup + } else { + return esa.Down[parent.Start] + } +} + +func (esa *EnhancedSuffixArray) edgeChar(parent *Interval, child *Interval) int16 { + pos := esa.SA[child.Start] + parent.Length + if pos >= int32(len(esa.Data)) { + return -1 + } + return int16(esa.Data[pos]) +} + +type intervalIterator struct { + esa *EnhancedSuffixArray + parent *Interval + start, end int32 + _next *Interval +} + +func (iter *intervalIterator) hasNext() bool { + return iter._next != nil +} + +func (iter *intervalIterator) next() *Interval { + r := iter._next + if iter.end != UNDEF { + iter.start = iter.end + iter.end = iter.esa.Next[iter.start] + iter._next = iter.esa.createInterval(iter.parent, iter.start, iter.end) + } else { + iter._next = nil + } + return r +} + +func (esa *EnhancedSuffixArray) firstLIndex(parent *Interval) int32 { + if *parent == esa.rootInterval { + return 0 + } else { + cup := esa.Up[parent.End] + if cup < parent.End && parent.Start < cup { + return cup + } else { + return esa.Down[parent.Start] + } + } +} + +func (esa *EnhancedSuffixArray) getChildren(parent *Interval) *intervalIterator { + iter := new(intervalIterator) + iter.esa = esa + iter.parent = parent + iter.start = parent.Start + iter.end = esa.firstLIndex(parent) + if iter.end == iter.start { + iter.end = esa.Next[iter.start] + } + iter._next = esa.createInterval(parent, iter.start, iter.end) + return iter +} + +func (esa *EnhancedSuffixArray) getInterval(parent *Interval, c int16) *Interval { + iter := esa.getChildren(parent) + for iter.hasNext() { + child := iter.next() + if c == esa.edgeChar(parent, child) { + return child + } + } + return nil +} + +func (esa *EnhancedSuffixArray) acceptInterval(parent *Interval, childStart, childEnd int32, consumer func(*Interval)) { + if childEnd 
== UNDEF { + childEnd = parent.End + } + if childStart+1 < childEnd { + consumer(esa.interval(childStart, childEnd)) + } else if childStart != childEnd { + consumer(&Interval{parent.Length, childStart, childEnd}) + } +} + +func (esa *EnhancedSuffixArray) forEachChild(parent *Interval, consumer func(*Interval)) { + i := parent.Start + nexti := esa.firstLIndex(parent) + if nexti == i { + nexti = esa.Next[i] + } + esa.acceptInterval(parent, i, nexti, consumer) + for nexti != UNDEF { + i = nexti + nexti = esa.Next[i] + esa.acceptInterval(parent, i, nexti, consumer) + } +} + +func (esa *EnhancedSuffixArray) Match(pattern []byte, dataOff int32, patternOff int32, mlen int32) bool { + for i := int32(0); i < mlen; i++ { + pIdx := patternOff + i + dIdx := dataOff + i + if pIdx >= int32(len(pattern)) || dIdx >= int32(len(esa.Data)) || pattern[pIdx] != esa.Data[dIdx] { + return false + } + } + return true +} + +func min32(a, b int32) int32 { + if a < b { + return a + } + return b +} + +func (esa *EnhancedSuffixArray) Find(pattern []byte, match func([]byte, int32, int32, int32) bool) *Interval { + plen := int32(len(pattern)) + if pattern == nil || plen == 0 { + panic("You must specify non-empty pattern") + } + c := int32(0) + queryFound := true + intv := esa.getInterval(&esa.rootInterval, int16(pattern[c])) + intvLen := int32(0) + for intv != nil && c < plen && queryFound { + intvLen = intv.End - intv.Start + if intvLen > 1 { + min := min32(intv.Length, plen) + queryFound = match(pattern, esa.SA[intv.Start]+c, c, min-c) + c = min + if c < plen { + intv = esa.getInterval(intv, int16(pattern[c])) + } + } else { + queryFound = match(pattern, esa.SA[intv.Start]+c, c, plen-c) + break + } + } + if intv != nil && queryFound { + return &Interval{plen, intv.Start, intv.End} + } + return nil +} + +type sortableBA [][]byte + +func (b sortableBA) Len() int { + return len(b) +} + +func (b sortableBA) Less(i, j int) bool { + // bytes package already implements Comparable for []byte. 
+ switch bytes.Compare(b[i], b[j]) { + case -1: + return true + case 0, 1: + return false + default: + log.Panic("not fail-able with `bytes.Comparable` bounded [-1, 1].") + return false + } +} + +func (b sortableBA) Swap(i, j int) { + b[j], b[i] = b[i], b[j] +} + +// Public +func SortBAs(src [][]byte) [][]byte { + sorted := sortableBA(src) + sort.Sort(sorted) + return sorted +} diff --git a/server/esa/esa_test.go b/server/esa/esa_test.go new file mode 100644 index 0000000..72d7d9e --- /dev/null +++ b/server/esa/esa_test.go @@ -0,0 +1,28 @@ +package esa + +import ( + "testing" +) + +func TestSuffixArray(t *testing.T) { + bytes := ([]byte)("ABRACADABRA") + esa, err := New(bytes) + if err != nil { + t.Errorf("Error creating enhanced suffix array: %v", err) + return + } + var suffixes [][]byte + for i := range bytes { + suffixes = append(suffixes, bytes[i:]) + } + suffixes = SortBAs(suffixes) + for i := range suffixes { + expected := string(suffixes[i]) + computed := string(bytes[esa.SA[i]:]) + if expected != computed { + t.Errorf("for i=%v SA[i]=%v suffix expected: %v computed %v", i, esa.SA[i], expected, computed) + return + } + } + +} diff --git a/server/go.mod b/server/go.mod new file mode 100644 index 0000000..673abeb --- /dev/null +++ b/server/go.mod @@ -0,0 +1,10 @@ +module github.com/mlinhard/exactly/server + +require ( + github.com/dustin/go-humanize v1.0.0 + github.com/golang-collections/collections v0.0.0-20130729185459-604e922904d3 + github.com/karrick/godirwalk v1.11.3 + github.com/mlinhard/sais-go v0.1.0 +) + +go 1.12 diff --git a/server/go.sum b/server/go.sum new file mode 100644 index 0000000..d670179 --- /dev/null +++ b/server/go.sum @@ -0,0 +1,8 @@ +github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo= +github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= +github.com/golang-collections/collections v0.0.0-20130729185459-604e922904d3 h1:zN2lZNZRflqFyxVaTIU61KNKQ9C0055u9CAfpmqUvo4= 
+github.com/golang-collections/collections v0.0.0-20130729185459-604e922904d3/go.mod h1:nPpo7qLxd6XL3hWJG/O60sR8ZKfMCiIoNap5GvD12KU= +github.com/karrick/godirwalk v1.11.3 h1:ZrtYOzzHRzItdU1MvkK3CLlhC4m3YTWFgGyiBuSCQSY= +github.com/karrick/godirwalk v1.11.3/go.mod h1:H5KPZjojv4lE+QYImBI8xVtrBRgYrIVsaRPx4tDPEn4= +github.com/mlinhard/sais-go v0.1.0 h1:lnKQTaL8WgMP7vuuQPJz0qG/ai4CQ1DH/a6pO63U3Eo= +github.com/mlinhard/sais-go v0.1.0/go.mod h1:/IZp4BaUbSBZv8bx5YNQKbOxokp9CmrU6/BG2mBVDDw= diff --git a/server/main.go b/server/main.go new file mode 100644 index 0000000..a985402 --- /dev/null +++ b/server/main.go @@ -0,0 +1,51 @@ +package main + +import ( + "flag" + "fmt" + "os" + "os/signal" + "syscall" + + "github.com/dustin/go-humanize" + "github.com/mlinhard/exactly/server/server" +) + +var Version string + +func main() { + configFile := flag.String("config", "-", "Configuration file") + flag.Parse() + var config *server.ServerConfig + var err error + if configFile != nil && *configFile != "-" { + config, err = server.LoadConfigFrom(*configFile) + } else { + config, err = server.LoadConfig() + } + if err != nil { + fmt.Printf("ERROR: %v\n", err) + return + } + server := server.NewServer(Version, config) + + server.OnDoneCrawling(func(size int, totalBytes int) { + fmt.Printf("Found %v files, %v. 
Loading ...\n", size, humanize.Bytes(uint64(totalBytes))) + }) + server.OnDoneLoading(func(size int, totalBytes int) { + fmt.Printf("Indexing ...\n") + }) + server.OnDoneIndexing(func(size int, totalBytes int) { + fmt.Printf("Ready to search.\n") + }) + server.OnError(func(err error) { + fmt.Printf("ERROR: %v\n", err) + }) + + done := make(chan os.Signal, 1) + signal.Notify(done, os.Interrupt, syscall.SIGINT, syscall.SIGTERM) + server.Start() + <-done + server.Stop() + fmt.Printf("Server terminated.\n") +} diff --git a/server/pom.xml b/server/pom.xml deleted file mode 100644 index f0e1305..0000000 --- a/server/pom.xml +++ /dev/null @@ -1,115 +0,0 @@ - - 4.0.0 - sk.linhard.exactly - exactly-server - 0.1.0-SNAPSHOT - - - org.springframework.boot - spring-boot-starter-parent - 2.0.3.RELEASE - - - - 1.8 - 1.8 - 25.1-jre - 7.4.0 - - - - - org.springframework.boot - spring-boot-starter-web - - - org.springframework.boot - spring-boot-starter-test - test - - - com.jayway.jsonpath - json-path - test - - - org.slf4j - slf4j-api - - - ch.qos.logback - logback-classic - - - org.apache.tika - tika-parsers - 1.17 - - - junit - junit - test - - - org.apache.commons - commons-lang3 - - - com.google.guava - guava - ${version.guava} - - - org.apache.lucene - lucene-core - ${version.lucene} - - - - - - src/main/resources - - VERSION - - true - - - src/main/resources - - VERSION - - false - - - - - maven-compiler-plugin - - ${java.version.source} - ${java.version.target} - -Xlint:unchecked - - - - - org.springframework.boot - spring-boot-maven-plugin - - - - - - - spring-releases - https://repo.spring.io/libs-release - - - - - spring-releases - https://repo.spring.io/libs-release - - - \ No newline at end of file diff --git a/server/search/emptyres.go b/server/search/emptyres.go new file mode 100644 index 0000000..b241a0b --- /dev/null +++ b/server/search/emptyres.go @@ -0,0 +1,64 @@ +// Empty search result logic +package search + +type EmptySearchResult []byte + +func (this 
EmptySearchResult) IsEmpty() bool { + return true +} + +func (this EmptySearchResult) Size() int { + return 0 +} + +func (this EmptySearchResult) Hit(i int) Hit { + panic("Empty search result has no hits") +} + +func (this EmptySearchResult) PatternLength() int { + return len(this) +} + +func (this EmptySearchResult) Pattern() []byte { + return this +} + +func (this EmptySearchResult) HasGlobalPosition(position int) bool { + return false +} + +func (this EmptySearchResult) HitWithGlobalPosition(position int) Hit { + panic("Empty search result has no hits") +} + +func (this EmptySearchResult) HasPosition(document, position int) bool { + return false +} + +func (this EmptySearchResult) HitWithPosition(document, position int) Hit { + panic("Empty search result has no hits") +} + +func (this EmptySearchResult) Positions() []int { + return make([]int, 0) +} + +func (this EmptySearchResult) document(hitIdx int) *Document { + panic("Empty search result has no hits") +} + +func (this EmptySearchResult) position(hitIdx int) int { + panic("Empty search result has no hits") +} + +func (this EmptySearchResult) globalPosition(hitIdx int) int { + panic("Empty search result has no hits") +} + +func (this EmptySearchResult) charContext(hitIndex int, charsBefore, charsAfter int) HitContext { + panic("Empty search result has no hits") +} + +func (this EmptySearchResult) lineContext(hitIndex int, linesBefore, linesAfter int) HitContext { + panic("Empty search result has no hits") +} diff --git a/server/search/helper_test.go b/server/search/helper_test.go new file mode 100644 index 0000000..ddc0497 --- /dev/null +++ b/server/search/helper_test.go @@ -0,0 +1,156 @@ +package search + +import ( + "fmt" + "testing" + + "github.com/golang-collections/collections/set" +) + +type TestSearch struct { + search Search + t *testing.T +} + +type TestSearchResult struct { + TestSearch + result SearchResult +} + +type TestHit struct { + TestSearchResult + hit Hit +} + +func NewSearchIn(t 
*testing.T, text ...string) *TestSearch { + if len(text) == 0 { + t.Errorf("You have to specify some texts") + return nil + } + if len(text) == 1 { + search, err := NewSingle("testDoc", []byte(text[0])) + if err != nil { + t.Error(err) + return nil + } + return &TestSearch{search, t} + } else { + offsets, combinedData := combine(text) + search, err := NewMulti(combinedData, offsets, testIds(len(text))) + if err != nil { + t.Error(err) + return nil + } + return &TestSearch{search, t} + } +} + +func testIds(n int) []string { + ids := make([]string, n) + for i := range ids { + ids[i] = fmt.Sprintf("testDoc%v", i) + } + return ids +} + +func combine(text []string) ([]int, []byte) { + totalLength := 0 + offsets := make([]int, len(text)) + for i := range text { + offsets[i] = totalLength + totalLength += len([]byte(text[i])) + } + r := make([]byte, totalLength) + for i := range text { + data := []byte(text[i]) + offset := offsets[i] + for j := range data { + r[offset+j] = data[j] + } + } + return offsets, r +} + +func toSet(array []int) *set.Set { + r := set.New() + for i := range array { + r.Insert(array[i]) + } + return r +} + +func (ts *TestSearch) Find(text string) *TestSearchResult { + return &TestSearchResult{*ts, ts.search.Find([]byte(text))} +} + +func (tsr *TestSearchResult) AssertHitCount(hitCount int) { + if tsr.result.Size() != hitCount { + tsr.t.Errorf("Unexpected hit count %v (expected %v)", tsr.result.Size(), hitCount) + } +} + +func (tsr *TestSearchResult) AssertPositions(positions ...int) { + computedPositions := tsr.result.Positions() + expectedSet := toSet(positions) + computedSet := toSet(computedPositions) + if !(expectedSet.SubsetOf(computedSet) && computedSet.SubsetOf(expectedSet)) { + tsr.t.Errorf("Expected positions for pattern %v: %v, but got %v", tsr.result.Pattern(), positions, computedPositions) + } +} + +func (tsr *TestSearchResult) AssertSingleHit() *TestHit { + if tsr.result.Size() != 1 { + tsr.t.Errorf("Expected single hit but got %v", 
tsr.result.Size()) + } + return &TestHit{*tsr, tsr.result.Hit(0)} +} + +func (th *TestHit) AssertPosition(pos int) *TestHit { + if th.hit.Position() != pos { + th.t.Errorf("Expected position %v but got %v", pos, th.hit.Position()) + } + return th +} + +func (th *TestHit) AssertDocument(doc int) *TestHit { + docIndex := th.hit.Document().Index + if docIndex != doc { + th.t.Errorf("Expected document %v but got %v", doc, docIndex) + } + return th +} + +func (th *TestHit) AssertCtx(maxCtx int, leftCtx string, rightCtx string) *TestHit { + ctx := th.hit.CharContext(maxCtx, maxCtx) + aLeftCtx := string(ctx.Before()) + aRightCtx := string(ctx.After()) + if leftCtx != aLeftCtx { + th.t.Errorf("Expected left context %v got %v", leftCtx, aLeftCtx) + } + if rightCtx != aRightCtx { + th.t.Errorf("Expected right context %v got %v", rightCtx, aRightCtx) + } + return th +} + +func (ts *TestSearch) AssertSingleHitCtx(pattern string, doc int, pos int, maxCtx int, leftCtx string, rightCtx string) { + ts.Find(pattern).AssertSingleHit().AssertDocument(doc).AssertPosition(pos).AssertCtx(maxCtx, leftCtx, rightCtx) +} + +func (th *TestHit) AssertLinesAbove(linesAbove int, lines string) *TestHit { + ctx := th.hit.LineContext(linesAbove, 0) + actualLines := string(ctx.Before()) + if actualLines != lines { + th.t.Errorf("Expected lines above %v but got %v", lines, actualLines) + } + return th +} + +func (th *TestHit) AssertLinesBelow(linesBelow int, lines string) *TestHit { + ctx := th.hit.LineContext(0, linesBelow) + actualLines := string(ctx.After()) + if actualLines != lines { + th.t.Errorf("Expected lines below %v but got %v", lines, actualLines) + } + return th +} diff --git a/server/search/hit.go b/server/search/hit.go new file mode 100644 index 0000000..3fe20c5 --- /dev/null +++ b/server/search/hit.go @@ -0,0 +1,56 @@ +package search + +type HitStruct struct { + searchResult SearchResult + hitIdx int +} + +type HitContextStruct struct { + data []byte + position int32 + lenBefore 
int32 + lenPattern int32 + lenAfter int32 +} + +func (this *HitStruct) GlobalPosition() int { + return this.searchResult.globalPosition(this.hitIdx) +} + +func (this *HitStruct) Position() int { + return this.searchResult.position(this.hitIdx) +} + +func (this *HitStruct) Document() *Document { + return this.searchResult.document(this.hitIdx) +} + +func (this *HitStruct) CharContext(charsBefore, charsAfter int) HitContext { + return this.searchResult.charContext(this.hitIdx, charsAfter, charsAfter) +} + +func (this *HitStruct) LineContext(linesBefore, linesAfter int) HitContext { + return this.searchResult.lineContext(this.hitIdx, linesBefore, linesAfter) +} + +func (this *HitContextStruct) Before() []byte { + return this.data[this.position : this.position+this.lenBefore] +} + +func (this *HitContextStruct) Pattern() []byte { + start := this.position + this.lenBefore + return this.data[start : start+this.lenPattern] +} + +func (this *HitContextStruct) After() []byte { + start := this.position + this.lenBefore + this.lenPattern + return this.data[start : start+this.lenAfter] +} + +func (this *HitContextStruct) HighlightStart() int { + return int(this.lenBefore) +} + +func (this *HitContextStruct) HighlightEnd() int { + return int(this.lenBefore + this.lenPattern) +} diff --git a/server/search/multidoc.go b/server/search/multidoc.go new file mode 100644 index 0000000..5c90b1e --- /dev/null +++ b/server/search/multidoc.go @@ -0,0 +1,297 @@ +package search + +import ( + "fmt" + "sort" + + "github.com/mlinhard/exactly/server/esa" +) + +type MultiDocumentSearch struct { + esa *esa.EnhancedSuffixArray + offsets []int32 + ids []string + separator []byte + newLineInSeparator int32 +} + +type MultiDocumentSearchResult struct { + MultiDocumentSearch + interval esa.Interval + docIndexCache []int32 +} + +func toint32(a []int) []int32 { + r := make([]int32, len(a)) + for i := range a { + r[i] = int32(a[i]) + } + return r +} + +func NewMulti(combinedContent []byte, offsets []int, 
docIds []string) (*MultiDocumentSearch, error) { + return NewMulti32(combinedContent, toint32(offsets), docIds) +} + +func NewMulti32(combinedContent []byte, offsets []int32, docIds []string) (*MultiDocumentSearch, error) { + search := new(MultiDocumentSearch) + search.ids = docIds + search.offsets = offsets + esa, separator, err := esa.NewMulti(combinedContent, search.offsets) + if err != nil { + return nil, err + } + search.separator = separator + search.esa = esa + search.newLineInSeparator = newLineInSeparator(separator) + return search, nil +} + +func newLineInSeparator(separator []byte) int32 { + for i := int32(0); i < int32(len(separator)); i++ { + if isNewLine(separator, i) > 0 { + return i + } + } + return -1 +} + +func (this *MultiDocumentSearch) separatorAt(pos int32) bool { + data := this.esa.Data + separator := this.separator + lSeparator := int32(len(separator)) + if pos+lSeparator <= int32(len(data)) && pos >= int32(0) { + for i := int32(0); i < lSeparator; i++ { + if separator[i] != data[pos+i] { + return false + } + } + return true + } else { + return false + } +} + +func (this *MultiDocumentSearch) separatorAwareMatch(pattern []byte, dataOff int32, patternOff int32, mlen int32) bool { + data := this.esa.Data + for i := int32(0); i < mlen; i++ { + pIdx := patternOff + i + dIdx := dataOff + i + if pIdx >= int32(len(pattern)) || dIdx >= int32(len(data)) || pattern[pIdx] != data[dIdx] || this.separatorAt(dIdx) { + return false + } + } + return true +} + +func (search *MultiDocumentSearch) Find(pattern []byte) SearchResult { + interval := search.esa.Find(pattern, search.separatorAwareMatch) + if interval == nil { + return EmptySearchResult(pattern) + } + sr := new(MultiDocumentSearchResult) + sr.interval = *interval + sr.MultiDocumentSearch = *search + sr.docIndexCache = make([]int32, sr.interval.End-sr.interval.Start) + for i := range sr.docIndexCache { + sr.docIndexCache[i] = esa.UNDEF + } + return sr +} + +func (search *MultiDocumentSearch) 
// Search32 returns the index of the first element of a that is strictly greater
// than n, or len(a) when there is none. It relies on sort.Search, so a has to be
// sorted in ascending order for the result to be meaningful.
func Search32(a []int32, n int32) int32 {
	greaterThanN := func(i int) bool {
		return a[i] > n
	}
	idx := sort.Search(len(a), greaterThanN)
	return int32(idx)
}
*MultiDocumentSearchResult) HasGlobalPosition(position int) bool { + return false +} + +func (this *MultiDocumentSearchResult) HitWithGlobalPosition(position int) Hit { + return nil +} + +func (this *MultiDocumentSearchResult) HasPosition(document, position int) bool { + return false +} + +func (this *MultiDocumentSearchResult) HitWithPosition(document, position int) Hit { + return nil +} + +func (this *MultiDocumentSearchResult) Positions() []int { + r := make([]int, this.Size()) + for i := range r { + r[i] = int(this.position(i)) + } + return r +} + +func (this *MultiDocumentSearchResult) charContext(hitIndex int, charsBefore, charsAfter int) HitContext { + if charsBefore < 0 || charsAfter < 0 { + panic("Negative context length") + } + pos := int32(this.globalPosition(hitIndex)) + beforeStart := this.checkBefore(pos, int32(charsBefore)) + afterEnd := this.checkAfter(pos+this.interval.Length, int32(charsAfter)) + return &HitContextStruct{ + this.esa.Data, + beforeStart, + pos - beforeStart, + this.interval.Length, + afterEnd - pos - this.interval.Length} +} + +func (this *MultiDocumentSearchResult) lineContext(hitIndex int, linesBefore, linesAfter int) HitContext { + if linesBefore < 0 || linesAfter < 0 { + panic("Negative context length") + } + patternStart := int32(this.globalPosition(hitIndex)) + beforeStart := this.linesBeforeStart(hitIndex, linesBefore) + afterEnd := this.linesAfterStart(hitIndex, linesAfter) + return &HitContextStruct{ + this.esa.Data, + beforeStart, + patternStart - beforeStart, + this.interval.Length, + afterEnd - patternStart - this.interval.Length} +} + +func (this *MultiDocumentSearchResult) checkBefore(pos int32, maxSize int32) int32 { + leftLimit := checkBeforeSingle(pos, maxSize) + sepLen := int32(len(this.separator)) + for i := pos - sepLen; i >= leftLimit; i-- { + if this.separatorAt(i) { + return i + sepLen + } + } + return leftLimit +} + +func (this *MultiDocumentSearchResult) checkAfter(pos int32, maxSize int32) int32 { + 
rightLimit := checkAfterSingle(int32(len(this.esa.Data)), pos, maxSize) + sepLen := int32(len(this.separator)) + sepRightLimit := rightLimit - sepLen + for i := pos; i <= sepRightLimit; i++ { + if this.separatorAt(i) { + return i + } + } + return rightLimit +} + +func (this *MultiDocumentSearchResult) linesBeforeStart(hitIndex int, maxLines int) int32 { + j := int32(this.globalPosition(hitIndex)) + newLine := int32(0) + lineCount := int32(0) + sepLen := int32(len(this.separator)) + sep := this.separatorAt(j) + for j >= 0 && !sep && lineCount <= int32(maxLines) { + newLine = isNewLine(this.esa.Data, j) + if newLine > 0 { + lineCount++ + } + j-- + sep = this.separatorAt(j) + } + /* + * if separator is contained in (newLineInSeparator == -1) or equal + * to (newLineInSeparator == 0) newline sequence this means that the + * newline sequence never appears in the data. That means that + * isNewLine always returns 0, lineCount never increases and + * therefore the loop is ended only by the separator. 
in both cases + * we want to return j + 1 + separator.length + * + * if newLine is contained (but not equal) in the separator + * (newLineInSeparator > 0) we want to return + * + */ + newLineEnd := j + 1 + newLine + if this.newLineInSeparator == -1 { + return newLineEnd + ifelse(sep, sepLen-1, 0) + } else { + limit := ifelse(j-this.newLineInSeparator < 0, 0, j-this.newLineInSeparator) + for j >= limit && !sep { + j-- + sep = this.separatorAt(j) + } + return ifelse(sep, j+sepLen, newLineEnd) + } +} + +func (this *MultiDocumentSearchResult) linesAfterStart(hitIndex int, maxLines int) int32 { + j := int32(this.globalPosition(hitIndex)) + this.interval.Length + lineCount := int32(0) + dataLen := int32(len(this.esa.Data)) + sep := this.separatorAt(j) + for j < dataLen && !sep && lineCount <= int32(maxLines) { + if isNewLine(this.esa.Data, j) > 0 { + lineCount++ + } + j++ + sep = this.separatorAt(j) + } + return ifelse(j == dataLen || sep, j, j-1) +} diff --git a/server/search/search.go b/server/search/search.go new file mode 100644 index 0000000..eaccbef --- /dev/null +++ b/server/search/search.go @@ -0,0 +1,258 @@ +package search + +import ( + "fmt" + + "github.com/mlinhard/exactly/server/esa" +) + +type Document struct { + Index int + Id string + Content []byte +} + +type SingleDocumentSearch struct { + esa *esa.EnhancedSuffixArray + docId string +} + +type SingleDocumentSearchResult struct { + SingleDocumentSearch + interval esa.Interval +} + +type HitContext interface { + Before() []byte + Pattern() []byte + After() []byte + HighlightStart() int // Length of string returned by Before() method + HighlightEnd() int // Length of before string + length of pattern +} + +// Represents one occurrence of the pattern in the text composed of one or more documents +type Hit interface { + GlobalPosition() int // global position in concatenated string of all documents including separators (will never return position inside of the separator) + Position() int // position inside of 
the document, i.e. number of bytes from the document start. + Document() *Document // The document this hit was found in + CharContext(charsBefore, charsAfter int) HitContext // Context of the found pattern inside of the document given as number of characters + LineContext(linesBefore, linesAfter int) HitContext +} + +// Result of the search for pattern in the text indexed by Search +type SearchResult interface { + Size() int // Number of occurrences of the pattern found + IsEmpty() bool + Hit(i int) Hit + PatternLength() int // Length of the original pattern that we searched for. + Pattern() []byte // Pattern that we searched for. + HasGlobalPosition(position int) bool // True iff pattern was found on given position + HitWithGlobalPosition(position int) Hit + HasPosition(document, position int) bool + HitWithPosition(document, position int) Hit + Positions() []int + + document(hitIndex int) *Document + globalPosition(hitIndex int) int + position(hitIndex int) int + charContext(hitIndex int, charsBefore, charsAfter int) HitContext + lineContext(hitIndex int, linesBefore, linesAfter int) HitContext +} + +type Search interface { + DocumentCount() int + Document(i int) *Document + Find(pattern []byte) SearchResult +} + +func NewSingle(docId string, docContent []byte) (*SingleDocumentSearch, error) { + search := new(SingleDocumentSearch) + search.docId = docId + esa, err := esa.New(docContent) + if err != nil { + return nil, err + } + search.esa = esa + return search, nil +} + +func (*SingleDocumentSearch) DocumentCount() int { + return 1 +} + +func (search *SingleDocumentSearch) Document(idx int) *Document { + if idx != 0 { + panic("Single document search contains only index 0") + } + r := new(Document) + r.Content = search.esa.Data + r.Id = search.docId + r.Index = 0 + return r +} + +func (search *SingleDocumentSearch) Find(pattern []byte) SearchResult { + interval := search.esa.Find(pattern, search.esa.Match) + if interval == nil { + return 
EmptySearchResult(pattern) + } + sr := new(SingleDocumentSearchResult) + sr.interval = *interval + sr.SingleDocumentSearch = *search + return sr +} + +func (this *SingleDocumentSearchResult) IsEmpty() bool { + return false +} + +func (this *SingleDocumentSearchResult) Size() int { + return int(this.interval.End - this.interval.Start) +} + +func (this *SingleDocumentSearchResult) globalPosition(hitIdx int) int { + if hitIdx < 0 || int32(hitIdx) >= int32(this.Size()) { + panic(fmt.Sprintf("Hit index %v exceeds the search result size %v", hitIdx, this.Size())) + } + return int(this.esa.SA[this.interval.Start+int32(hitIdx)]) +} + +func (search *SingleDocumentSearchResult) document(hitIdx int) *Document { + return search.Document(0) +} + +func (this *SingleDocumentSearchResult) position(hitIdx int) int { + return this.globalPosition(hitIdx) +} + +func (this *SingleDocumentSearchResult) Hit(hitIdx int) Hit { + return &HitStruct{this, hitIdx} +} + +func (this *SingleDocumentSearchResult) PatternLength() int { + return int(this.interval.Length) +} + +func (this *SingleDocumentSearchResult) Pattern() []byte { + patternStart := this.esa.SA[this.interval.Start] + return this.esa.Data[patternStart : patternStart+this.interval.Length] +} + +func (this *SingleDocumentSearchResult) HasGlobalPosition(position int) bool { + return false +} + +func (this *SingleDocumentSearchResult) HitWithGlobalPosition(position int) Hit { + return nil +} + +func (this *SingleDocumentSearchResult) HasPosition(document, position int) bool { + return false +} + +func (this *SingleDocumentSearchResult) HitWithPosition(document, position int) Hit { + return nil +} + +func (this *SingleDocumentSearchResult) Positions() []int { + r := make([]int, this.Size()) + for i := range r { + r[i] = int(this.esa.SA[this.interval.Start+int32(i)]) + } + return r +} + +func ifelse(expr bool, onTrue int32, onFalse int32) int32 { + if expr { + return onTrue + } else { + return onFalse + } +} + +func 
checkBeforeSingle(pos int32, maxSize int32) int32 { + r := pos - maxSize + return ifelse(r < 0, 0, r) +} + +func checkAfterSingle(dataLen int32, pos int32, maxSize int32) int32 { + r := pos + maxSize + return ifelse(r > dataLen, dataLen, r) +} + +func (this *SingleDocumentSearchResult) charContext(hitIndex int, charsBefore, charsAfter int) HitContext { + if charsBefore < 0 || charsAfter < 0 { + panic("Negative context length") + } + pos := int32(this.globalPosition(hitIndex)) + beforeStart := checkBeforeSingle(pos, int32(charsBefore)) + afterEnd := checkAfterSingle(int32(len(this.esa.Data)), pos+this.interval.Length, int32(charsAfter)) + return &HitContextStruct{ + this.esa.Data, + beforeStart, + pos - beforeStart, + this.interval.Length, + afterEnd - pos - this.interval.Length} +} + +func isNewLine(data []byte, i int32) int32 { + ldata := int32(len(data)) + if i >= 0 && i < ldata { + c0 := data[i] + if c0 == 13 { + return ifelse(i == ldata-1 || data[i+1] != 10, 1, 2) + } else if c0 == 10 { + return ifelse(i == 0 || data[i-1] != 13, 1, 0) + } else { + return 0 + } + } else { + return 0 + } +} + +func (this *SingleDocumentSearchResult) isNewLine(i int32) int32 { + return isNewLine(this.esa.Data, i) +} + +func (this *SingleDocumentSearchResult) linesBeforeStart(hitIndex int, maxLines int) int32 { + j := int32(this.globalPosition(hitIndex)) + newLine := int32(0) + lineCount := int32(0) + for j >= 0 && lineCount <= int32(maxLines) { + newLine = this.isNewLine(j) + if newLine > 0 { + lineCount++ + } + j-- + } + return j + 1 + newLine +} + +func (this *SingleDocumentSearchResult) linesAfterStart(hitIndex int, maxLines int) int32 { + j := int32(this.globalPosition(hitIndex)) + this.interval.Length + lineCount := int32(0) + dataLength := int32(len(this.esa.Data)) + for j < dataLength && lineCount <= int32(maxLines) { + if this.isNewLine(j) > 0 { + lineCount++ + } + j++ + } + return ifelse(j == dataLength, j, j-1) +} + +func (this *SingleDocumentSearchResult) 
lineContext(hitIndex int, linesBefore, linesAfter int) HitContext { + if linesBefore < 0 || linesAfter < 0 { + panic("Negative context length") + } + patternStart := int32(this.globalPosition(hitIndex)) + beforeStart := this.linesBeforeStart(hitIndex, linesBefore) + afterEnd := this.linesAfterStart(hitIndex, linesAfter) + return &HitContextStruct{ + this.esa.Data, + beforeStart, + patternStart - beforeStart, + this.interval.Length, + afterEnd - patternStart - this.interval.Length} +} diff --git a/server/search/search_test.go b/server/search/search_test.go new file mode 100644 index 0000000..c02b2f2 --- /dev/null +++ b/server/search/search_test.go @@ -0,0 +1,241 @@ +package search + +import ( + "testing" +) + +func TestSearch32(t *testing.T) { + a := []int32{5, 10, 15} + assertEqual(t, a, 20, 3) + assertEqual(t, a, 16, 3) + assertEqual(t, a, 15, 3) + assertEqual(t, a, 14, 2) + assertEqual(t, a, 11, 2) + assertEqual(t, a, 10, 2) + assertEqual(t, a, 9, 1) + assertEqual(t, a, 6, 1) + assertEqual(t, a, 5, 1) + assertEqual(t, a, 4, 0) + assertEqual(t, a, 1, 0) + assertEqual(t, a, 0, 0) + assertEqual(t, a, -1, 0) +} +func TestSearch32_2(t *testing.T) { + a := []int32{0, 5, 10} + assertEqual(t, a, 10, 3) + assertEqual(t, a, 6, 2) + assertEqual(t, a, 5, 2) + assertEqual(t, a, 4, 1) + assertEqual(t, a, 1, 1) + assertEqual(t, a, 0, 1) +} + +func TestAbracadabra(t *testing.T) { + search := NewSearchIn(t, "abracadabra") + search.Find("abracadabra").AssertPositions(0) + search.Find("bracadabra").AssertPositions(1) + search.Find("racadabra").AssertPositions(2) + search.Find("acadabra").AssertPositions(3) + search.Find("cadabra").AssertPositions(4) + search.Find("adabra").AssertPositions(5) + search.Find("dabra").AssertPositions(6) + search.Find("abra").AssertPositions(7, 0) + search.Find("bra").AssertPositions(8, 1) + search.Find("ra").AssertPositions(9, 2) + search.Find("a").AssertPositions(10, 7, 0, 3, 5) + search.Find("b").AssertPositions(8, 1) + 
// TestAcaaacatat searches the single document "acaaacatat" for each of its
// suffixes and for prefixes of those suffixes, asserting the positions at
// which each pattern occurs.
// NOTE(review): whether AssertPositions is order-sensitive is decided in
// helper_test.go, which is not visible here.
func TestAcaaacatat(t *testing.T) {
	search := NewSearchIn(t, "acaaacatat")
	// every suffix of the text
	search.Find("acaaacatat").AssertPositions(0)
	search.Find("caaacatat").AssertPositions(1)
	search.Find("aaacatat").AssertPositions(2)
	search.Find("aacatat").AssertPositions(3)
	search.Find("acatat").AssertPositions(4)
	search.Find("catat").AssertPositions(5)
	search.Find("atat").AssertPositions(6)
	search.Find("tat").AssertPositions(7)
	search.Find("at").AssertPositions(8, 6)
	search.Find("t").AssertPositions(9, 7)

	// prefixes of the suffix starting at 0
	search.Find("acaaacatat").AssertPositions(0)
	search.Find("acaaacata").AssertPositions(0)
	search.Find("acaaacat").AssertPositions(0)
	search.Find("acaaaca").AssertPositions(0)
	search.Find("acaaac").AssertPositions(0)
	search.Find("acaaa").AssertPositions(0)
	search.Find("acaa").AssertPositions(0)
	search.Find("aca").AssertPositions(0, 4)
	search.Find("ac").AssertPositions(0, 4)
	search.Find("a").AssertPositions(2, 3, 0, 4, 8, 6)

	// prefixes of the suffix starting at 1
	search.Find("caaacatat").AssertPositions(1)
	search.Find("caaacata").AssertPositions(1)
	search.Find("caaacat").AssertPositions(1)
	search.Find("caaaca").AssertPositions(1)
	search.Find("caaac").AssertPositions(1)
	search.Find("caaa").AssertPositions(1)
	search.Find("caa").AssertPositions(1)
	search.Find("ca").AssertPositions(1, 5)
	search.Find("c").AssertPositions(1, 5)

	// prefixes of the suffix starting at 2
	search.Find("aaacatat").AssertPositions(2)
	search.Find("aaacata").AssertPositions(2)
	search.Find("aaacat").AssertPositions(2)
	search.Find("aaaca").AssertPositions(2)
	search.Find("aaac").AssertPositions(2)
	search.Find("aaa").AssertPositions(2)
	search.Find("aa").AssertPositions(2, 3)

	// prefixes of the suffix starting at 3
	search.Find("aacatat").AssertPositions(3)
	search.Find("aacata").AssertPositions(3)
	search.Find("aacat").AssertPositions(3)
	search.Find("aaca").AssertPositions(3)
	search.Find("aac").AssertPositions(3)

	// prefixes of the suffix starting at 4
	search.Find("acatat").AssertPositions(4)
	search.Find("acata").AssertPositions(4)
	search.Find("acat").AssertPositions(4)

	// prefixes of the suffix starting at 5
	search.Find("catat").AssertPositions(5)
	search.Find("cata").AssertPositions(5)
	search.Find("cat").AssertPositions(5)

	// prefixes of the suffix starting at 6
	search.Find("atat").AssertPositions(6)
	search.Find("ata").AssertPositions(6)
}
// TestJoin indexes four five-byte documents together and checks that hits are
// attributed to the right document, that a pattern spanning two documents is
// not found, and that contexts stay within the document of the hit.
func TestJoin(t *testing.T) {
	search := NewSearchIn(t, "abcde", "fghij", "klmno", "pqrst")
	// "defg" would only match across the boundary of documents 0 and 1
	search.Find("defg").AssertPositions()
	search.Find("abc").AssertSingleHit().AssertDocument(0).AssertPosition(0)
	search.Find("fgh").AssertSingleHit().AssertDocument(1).AssertPosition(0)
	search.Find("klm").AssertSingleHit().AssertDocument(2).AssertPosition(0)
	search.Find("pqr").AssertSingleHit().AssertDocument(3).AssertPosition(0)

	// NOTE(review): the arguments look like (pattern, document, position,
	// context size, expected before, expected after) - confirm in helper_test.go.
	search.AssertSingleHitCtx("bcd", 0, 1, 2, "a", "e")
	search.AssertSingleHitCtx("ghi", 1, 1, 1, "f", "j")
	search.AssertSingleHitCtx("lmn", 2, 1, 10, "k", "o")
	search.AssertSingleHitCtx("qrs", 3, 1, 100, "p", "t")

	// patterns equal to a whole document
	search.Find("abcde").AssertSingleHit().AssertDocument(0).AssertPosition(0)
	search.Find("fghij").AssertSingleHit().AssertDocument(1).AssertPosition(0)
	search.Find("klmno").AssertSingleHit().AssertDocument(2).AssertPosition(0)
	search.Find("pqrst").AssertSingleHit().AssertDocument(3).AssertPosition(0)

	search.Find("abcde").AssertPositions(0)
	search.Find("fghij").AssertPositions(0)
	search.Find("klmno").AssertPositions(0)
	search.Find("pqrst").AssertPositions(0)
}
+func TestLineContext(t *testing.T) { + search := NewSearchIn(t, "aaa\nbbb\nccc\nddd\neee") + result := search.Find("ccc") + hit := result.AssertSingleHit() + hit.AssertLinesAbove(0, "") + hit.AssertLinesAbove(1, "bbb\n") + hit.AssertLinesAbove(2, "aaa\nbbb\n") + hit.AssertLinesAbove(3, "aaa\nbbb\n") + hit.AssertLinesBelow(0, "") + hit.AssertLinesBelow(1, "\nddd") + hit.AssertLinesBelow(2, "\nddd\neee") + hit.AssertLinesBelow(3, "\nddd\neee") +} + +func TestLineContext2(t *testing.T) { + search := NewSearchIn(t, "aaa\nbbb\nccGGcc\nddd\neee") + result := search.Find("GG") + hit := result.AssertSingleHit() + hit.AssertLinesAbove(0, "cc") + hit.AssertLinesAbove(1, "bbb\ncc") + hit.AssertLinesAbove(2, "aaa\nbbb\ncc") + hit.AssertLinesAbove(3, "aaa\nbbb\ncc") + hit.AssertLinesBelow(0, "cc") + hit.AssertLinesBelow(1, "cc\nddd") + hit.AssertLinesBelow(2, "cc\nddd\neee") + hit.AssertLinesBelow(3, "cc\nddd\neee") +} + +func TestAAAA(t *testing.T) { + search := NewSearchIn(t, "aaaaaaaaaaaaaaaaaaaa") + search.Find("aaaa").AssertPositions(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) +} + +func assertEqual(t *testing.T, a []int32, n int32, expectedR int32) { + r := Search32(a, n) + if r != expectedR { + t.Errorf("Search32(%v, %v) is %v not %v", a, n, r, expectedR) + } +} diff --git a/server/server/client_test.go b/server/server/client_test.go new file mode 100644 index 0000000..e8d520a --- /dev/null +++ b/server/server/client_test.go @@ -0,0 +1,168 @@ +package server + +import ( + "bytes" + "encoding/json" + "fmt" + "io/ioutil" + "net/http" + "testing" + + "github.com/mlinhard/exactly/server/search" +) + +type testClient struct { + t *testing.T + baseUrl string + maxContext int + maxHits int +} + +type testClientResponse struct { + client *testClient + response *SearchResponse +} + +type testClientHit struct { + response *testClientResponse + Index int + hit Hit +} + +func (this *testClient) DocumentCount() int { + return 1 +} + +func (this *testClient) Document(i 
int) *search.Document { + request := new(DocumentRequest) + response := new(DocumentResponse) + request.DocumentIndex = &i + err := this.postJson("/document", request, response) + if err != nil { + this.t.Error(err) + return nil + } + doc := new(search.Document) + doc.Content = response.Content + doc.Id = response.DocumentId + doc.Index = response.DocumentIndex + return doc +} + +func (this *testClient) search(pattern string) *testClientResponse { + return this.searchBounded(pattern, 0, this.maxContext, this.maxHits) +} + +func (this *testClient) searchBounded(pattern string, offset, maxContext, maxHits int) *testClientResponse { + request := new(SearchRequest) + response := new(SearchResponse) + request.MaxContext = maxContext + request.MaxHits = maxHits + request.Offset = &offset + request.Pattern = []byte(pattern) + err := this.postJson("/search", request, response) + if err != nil { + this.t.Error(err) + return nil + } + return &testClientResponse{this, response} +} + +func (this *testClient) postJson(relUrl string, request, response interface{}) error { + reqData, err := json.Marshal(request) + if err != nil { + this.t.Errorf("marshalling: %v", err) + return nil + } + var httpResp *http.Response + httpResp, err = http.Post(this.baseUrl+relUrl, "application/json", bytes.NewReader(reqData)) + if err != nil { + this.t.Errorf("http post: %v", err) + return nil + } + respData, err := ioutil.ReadAll(httpResp.Body) + fmt.Printf("received resp: %v", string(respData)) + err = json.Unmarshal(respData, response) + if err != nil { + this.t.Errorf("unmarshalling: %v", err) + } + return nil +} + +func (this *testClientResponse) AssertHitCount(hitCount int) { + if len(this.response.Hits) != hitCount { + this.client.t.Errorf("Unexpected hit count %v (expected %v)", len(this.response.Hits), hitCount) + } +} + +func (this *testClientResponse) AssertHitDocId(docId string) *testClientHit { + for hitIdx, hit := range this.response.Hits { + if hit.DocumentId == docId { + return 
&testClientHit{this, hitIdx, hit} + } + } + this.client.t.Errorf("Document ID %v not found", docId) + return nil +} + +func (this *testClientResponse) AssertNoCursor() { + c := this.response.Cursor + if c != nil { + this.client.t.Errorf("Expected no cursor, but cursor offset=%v, total=%v present", c.Offset, c.CompleteSize) + } +} + +func (this *testClientResponse) AssertCursor(offset, completeSize int) { + c := this.response.Cursor + if c == nil { + this.client.t.Errorf("Expected cursor offset=%v, total=%v not found", offset, completeSize) + } + if c.Offset != offset { + this.client.t.Errorf("Expected cursor offset %v but got %v", offset, c.Offset) + } + if c.CompleteSize != completeSize { + this.client.t.Errorf("Expected cursor total %v but got %v", completeSize, c.CompleteSize) + } +} + +func (this *testClientHit) AssertBefore(ctx string) *testClientHit { + if this == nil { + return nil + } + actualCtx := string(this.hit.ContextBefore) + if actualCtx != ctx { + this.response.client.t.Errorf("Expected before context %v and got %v", ctx, actualCtx) + } + return this +} + +func (this *testClientHit) AssertAfter(ctx string) *testClientHit { + if this == nil { + return nil + } + actualCtx := string(this.hit.ContextAfter) + if actualCtx != ctx { + this.response.client.t.Errorf("Expected after context %v and got %v", ctx, actualCtx) + } + return this +} + +func (this *testClientHit) AssertPosition(position int) *testClientHit { + if this == nil { + return nil + } + if this.hit.Position != position { + this.response.client.t.Errorf("Unexpected position %v in hit %v (expected %v)", this.hit.Position, this.Index, position) + } + return this +} + +func (this *testClientHit) AssertIndex(hitIdx int) *testClientHit { + if this == nil { + return nil + } + if this.Index != hitIdx { + this.response.client.t.Errorf("Unexpected hit index %v for doc %v (expected %v)", this.Index, this.hit.DocumentId, hitIdx) + } + return this +} diff --git a/server/server/config.go 
b/server/server/config.go new file mode 100644 index 0000000..746311c --- /dev/null +++ b/server/server/config.go @@ -0,0 +1,87 @@ +package server + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "path/filepath" +) + +const ( + CONFIG_FOLDER = "exactly" // inside user's config dir + CONFIG_FILE = "server-config.json" // inside config folder +) + +type ServerConfig struct { + ListenAddress string `json:"listen_address"` + NumFileLoaders int `json:"num_file_loaders"` + NumFileStaters int `json:"num_file_staters"` + Roots []string `json:"roots"` + IgnoredDirs []string `json:"ignored_directories"` +} + +func saveConfigTo(configFile string, config *ServerConfig) error { + bytes, err := json.Marshal(config) + if err != nil { + return fmt.Errorf("Config marshalling: %v", err) + } + configDir := filepath.Dir(configFile) + err = os.MkdirAll(configDir, 0770) + if err != nil { + return fmt.Errorf("Couldn't create config dir: %v: %v", configDir, err) + } + err = ioutil.WriteFile(configFile, bytes, 0664) + if err != nil { + return fmt.Errorf("Config writing: %v", err) + } + return nil +} + +func LoadConfigFrom(configFile string) (*ServerConfig, error) { + bytes, err := ioutil.ReadFile(configFile) + if err != nil { + return nil, err + } + config := new(ServerConfig) + err = json.Unmarshal(bytes, config) + if err != nil { + return nil, err + } + return config, nil +} + +func getUserConfigFile() (string, error) { + // TODO: redo, once https://golang.org/pkg/os/#UserConfigDir is complete + homedir, err := os.UserHomeDir() + if err != nil { + return "", err + } + return filepath.Join(homedir, ".config", CONFIG_FOLDER, CONFIG_FILE), nil +} + +func LoadConfig() (*ServerConfig, error) { + configPath, err := getUserConfigFile() + if err != nil { + return nil, fmt.Errorf("Determining home directory: %v", err) + } + _, err = os.Stat(configPath) + var config *ServerConfig + if os.IsNotExist(err) { + config = defaultConfig() + err = saveConfigTo(configPath, config) + if err != 
nil { + return nil, fmt.Errorf("Saving newly created default config: %v", err) + } + } else { + config, err = LoadConfigFrom(configPath) + if err != nil { + return nil, fmt.Errorf("Loading config: %v", err) + } + } + return config, nil +} + +func defaultConfig() *ServerConfig { + return &ServerConfig{"localhost:8080", 4, 4, []string{"."}, []string{}} +} diff --git a/server/server/docstore.go b/server/server/docstore.go new file mode 100644 index 0000000..e5b30f5 --- /dev/null +++ b/server/server/docstore.go @@ -0,0 +1,188 @@ +package server + +import ( + "fmt" + "os" + "sync" + + "github.com/mlinhard/exactly/server/search" +) + +type DocumentStore struct { + documentIds []string + offsets []int32 + data []byte +} + +type loaderEntry struct { + idx int + path string + buffer []byte +} + +func LoadDocuments(fileWalk *FileWalk) (*DocumentStore, error) { + docStore := new(DocumentStore) + docStore.data = make([]byte, int(fileWalk.TotalBytes)) + docStore.offsets = computeOffsets(fileWalk.Entries) + docStore.documentIds = documentIds(fileWalk.Entries) + + entries := make(chan loaderEntry) + successes := make(chan int, PATH_BUFFER_SIZE) + errors := make(chan error, ERROR_BUFFER_SIZE) + wgCounter := new(sync.WaitGroup) + wgLoaders := new(sync.WaitGroup) + loaded := make([]bool, fileWalk.Size()) + + startLoaders(wgLoaders, entries, successes, errors, 4) + + wgCounter.Add(1) + go counter(wgCounter, successes, loaded) + + for i, entry := range fileWalk.Entries { + dataStart := docStore.offsets[i] + dataEnd := int32(len(docStore.data)) + if i < len(docStore.offsets)-1 { + dataEnd = docStore.offsets[i+1] + } + entries <- loaderEntry{i, entry.Path, docStore.data[dataStart:dataEnd]} + } + close(entries) + wgLoaders.Wait() + close(errors) + close(successes) + wgCounter.Wait() + + var errorbuf []error + for err := range errors { + errorbuf = append(errorbuf, err) + } + + if len(errorbuf) > 0 { + return nil, fmt.Errorf("Errors during file loading: %v", errorbuf) + } + + for i, flag 
:= range loaded { + if !flag { + return nil, fmt.Errorf("File %v not loaded", docStore.documentIds[i]) + } + } + + return docStore, nil +} + +func computeOffsets(entries []FileEntry) []int32 { + offsets := make([]int32, len(entries)) + currentOffset := int32(0) + for i, entry := range entries { + offsets[i] = currentOffset + currentOffset += int32(entry.Size) + } + return offsets +} + +func documentIds(entries []FileEntry) []string { + docIds := make([]string, len(entries)) + for i, entry := range entries { + docIds[i] = entry.Path + } + return docIds +} + +func counter(wg *sync.WaitGroup, successes chan int, loaded []bool) { + defer wg.Done() + for successIdx := range successes { + loaded[successIdx] = true + } +} + +func startLoaders(wg *sync.WaitGroup, entries chan loaderEntry, successes chan int, errors chan error, numWorkers int) { + wg.Add(numWorkers) + for i := 0; i < numWorkers; i++ { + go loaderWorker(wg, entries, successes, errors) + } +} + +func loaderWorker(wg *sync.WaitGroup, entries chan loaderEntry, successes chan int, errors chan error) { + defer wg.Done() + for entry := range entries { + err := loadFile(entry.path, entry.buffer) + if err != nil { + errors <- err + } else { + successes <- entry.idx + } + } +} + +func loadFile(path string, buffer []byte) error { + f, err := os.Open(path) + if err != nil { + return err + } + defer f.Close() + bytesRead, err := f.Read(buffer) + if err != nil { + return err + } + if bytesRead != len(buffer) { + return fmt.Errorf("Unexpected length %v read from file %v, expected %v", bytesRead, path, len(buffer)) + } + return nil +} + +func (this *DocumentStore) Size() int { + return len(this.documentIds) +} + +func (this *DocumentStore) DocumentById(documentId string) *search.Document { + docIdx := this.IndexOf(documentId) + if docIdx == -1 { + return nil + } + return this.Document(docIdx) +} + +func (this *DocumentStore) Document(idx int) *search.Document { + if idx >= len(this.offsets) || idx >= len(this.documentIds) 
{ + panic("Illegal document index") + } + doc := new(search.Document) + start := this.offsets[idx] + end := int32(len(this.data)) + if idx < len(this.offsets)-1 { + end = this.offsets[idx+1] + } + doc.Content = this.data[start:end] + doc.Id = this.documentIds[idx] + doc.Index = idx + return doc +} + +func (this *DocumentStore) Data() []byte { + return this.data +} + +func (this *DocumentStore) Offsets() []int32 { + return this.offsets +} + +func (this *DocumentStore) DocumentId(idx int) string { + return this.documentIds[idx] +} + +func (this *DocumentStore) TotalBytes() int { + return len(this.data) +} + +func (this *DocumentStore) IndexOf(documentId string) int { + for i, id := range this.documentIds { + if id == documentId { + return i + } + } + return -1 +} + +func (this *DocumentStore) ContainsId(documentId string) bool { + return this.IndexOf(documentId) != -1 +} diff --git a/server/server/docstore_test.go b/server/server/docstore_test.go new file mode 100644 index 0000000..caf7202 --- /dev/null +++ b/server/server/docstore_test.go @@ -0,0 +1,47 @@ +package server + +import "testing" + +type TestDocStore struct { + t *testing.T + tempDir *TempDir + docStore *DocumentStore +} + +func loadTestDocStore(tempDir *TempDir, testFileWalk *TestFileWalk) *TestDocStore { + tds := new(TestDocStore) + tds.t = testFileWalk.t + tds.tempDir = tempDir + docStore, err := LoadDocuments(testFileWalk.fileWalk) + if err != nil { + tds.t.Errorf("Error loading document store %v", err) + return nil + } + tds.docStore = docStore + return tds +} + +func (this *TestDocStore) assertTotalBytes(expectedTotalBytes int) { + if this.docStore.TotalBytes() != expectedTotalBytes { + this.t.Errorf("Unexpected document store data size: %v (expected %v)", this.docStore.TotalBytes(), expectedTotalBytes) + } +} + +func (this *TestDocStore) assertSize(expectedSize int) { + if this.docStore.Size() != expectedSize { + this.t.Errorf("Unexpected document store size: %v (expected %v)", 
this.docStore.Size(), expectedSize) + } +} + +func (this *TestDocStore) assertContains(relPath string, docContent string) { + docId := this.tempDir.Path(relPath) + doc := this.docStore.DocumentById(docId) + if doc == nil { + this.t.Errorf("Store doesn't contain document %v", docId) + return + } + actualContent := string(doc.Content) + if actualContent != docContent { + this.t.Errorf("Document %v content is %v (expected %v)", docId, actualContent, docContent) + } +} diff --git a/server/server/dto.go b/server/server/dto.go new file mode 100644 index 0000000..3280f1d --- /dev/null +++ b/server/server/dto.go @@ -0,0 +1,46 @@ +package server + +type SearchRequest struct { + Pattern []byte `json:"pattern"` + MaxHits int `json:"max_hits"` + MaxContext int `json:"max_context"` + Offset *int `json:"offset,omitempty"` +} + +type Hit struct { + Position int `json:"pos"` + DocumentId string `json:"doc_id"` + ContextBefore []byte `json:"ctx_before"` + ContextAfter []byte `json:"ctx_after"` +} + +type Cursor struct { + CompleteSize int `json:"complete_size"` + Offset int `json:"offset"` +} + +type SearchResponse struct { + Hits []Hit `json:"hits"` + Cursor *Cursor `json:"cursor,omitempty"` +} + +type SearchServerStats struct { + Version string `json:"version"` + IndexedBytes int `json:"indexed_bytes"` + IndexedFiles int `json:"indexed_files"` + DoneCrawling bool `json:"done_crawling"` + DoneLoading bool `json:"done_loading"` + DoneIndexing bool `json:"done_indexing"` + Errors []string `json:"errors,omitempty"` +} + +type DocumentRequest struct { + DocumentId *string `json:"document_id,omitempty"` + DocumentIndex *int `json:"document_index,omitempty"` +} + +type DocumentResponse struct { + DocumentId string `json:"document_id"` + DocumentIndex int `json:"document_index"` + Content []byte `json:"content"` +} diff --git a/server/server/filewalk.go b/server/server/filewalk.go new file mode 100644 index 0000000..983be82 --- /dev/null +++ b/server/server/filewalk.go @@ -0,0 +1,189 @@ 
+package server + +import ( + "os" + "path/filepath" + "sync" + + "github.com/karrick/godirwalk" +) + +const ( + PATH_BUFFER_SIZE = 1024 * 1024 + ERROR_BUFFER_SIZE = 1024 * 10 +) + +type FileEntry struct { + Path string + Size int64 +} + +type FileWalk struct { + Roots []string + Ignore []string + TotalBytes int64 + Entries []FileEntry + Errors []error +} + +func NewFileWalk(roots []string, ignore []string) *FileWalk { + fileWalk := new(FileWalk) + fileWalk.TotalBytes = 0 + fileWalk.Roots = roots + fileWalk.Ignore = ignore + paths := make(chan string, PATH_BUFFER_SIZE) + errors := make(chan error, ERROR_BUFFER_SIZE) + entries := make(chan FileEntry, PATH_BUFFER_SIZE) + wgCollectors := new(sync.WaitGroup) + wgCounter := new(sync.WaitGroup) + go fileWalk.entryCollector(wgCollectors, entries) + go fileWalk.errorCollector(wgCollectors, errors) + go countSizes(wgCounter, paths, entries, errors, 8) + walk(roots, ignore, paths, errors) + close(paths) + wgCounter.Wait() + close(entries) + close(errors) + wgCollectors.Wait() + return fileWalk +} + +func (this *FileWalk) entryCollector(wg *sync.WaitGroup, entries chan FileEntry) { + wg.Add(1) + defer wg.Done() + for entry := range entries { + this.Entries = append(this.Entries, entry) + this.TotalBytes += entry.Size + } +} + +func (this *FileWalk) errorCollector(wg *sync.WaitGroup, errors chan error) { + wg.Add(1) + defer wg.Done() + for err := range errors { + this.Errors = append(this.Errors, err) + } +} + +func countSizes(wg *sync.WaitGroup, paths chan string, entries chan FileEntry, errors chan error, numWorkers int) { + wg.Add(numWorkers) + for i := 0; i < numWorkers; i++ { + go countSizeWorker(wg, paths, errors, entries, i) + } +} + +func countSizeWorker(wg *sync.WaitGroup, paths chan string, errors chan error, entries chan FileEntry, i int) { + defer wg.Done() + for path := range paths { + info, err := os.Stat(path) + if err != nil { + errors <- err + } else { + entries <- FileEntry{path, info.Size()} + } + } +} + 
+func walk(roots []string, ignore []string, paths chan string, errors chan error) { + var wg sync.WaitGroup + wg.Add(len(roots)) + ignorer := createIgnorer(ignore...) + for _, root := range roots { + go walkOne(&wg, root, ignorer, paths, errors) + } + wg.Wait() +} + +type ignorer struct { + abs []string + rel []string +} + +func createIgnorer(ignore ...string) *ignorer { + ignorer := new(ignorer) + ignorer.rel, ignorer.abs = relAbsSplit(ignore) + return ignorer +} + +func relAbsSplit(ignore []string) ([]string, []string) { + var rel, abs []string + for _, dir := range ignore { + if filepath.IsAbs(dir) { + abs = append(abs, dir) + } else { + rel = append(rel, dir) + } + } + return rel, abs +} + +func startsWith(basepath, path string) bool { + if len(basepath) > len(path) { + return false + } + if basepath != path[:len(basepath)] { + return false + } + list := append(filepath.SplitList(basepath), filepath.SplitList(path[len(basepath):])...) + if path != filepath.Join(list...) { + return false + } + return true +} + +func (this *ignorer) match(path string) bool { + for _, ignorePath := range this.abs { + if startsWith(ignorePath, path) { + return true + } + } + base := filepath.Base(path) + for _, ignorePath := range this.rel { + if ignorePath == base { + return true + } + } + return false +} + +func walkOne(wg *sync.WaitGroup, root string, ignorer *ignorer, paths chan string, errors chan error) { + defer wg.Done() + err := godirwalk.Walk(root, &godirwalk.Options{ + Callback: func(osPathname string, de *godirwalk.Dirent) error { + if de.IsRegular() { + paths <- osPathname + } else if de.IsDir() && ignorer.match(osPathname) { + return filepath.SkipDir + } + return nil + }, + ErrorCallback: func(osPathname string, err error) godirwalk.ErrorAction { + errors <- err + return godirwalk.SkipNode + }, + }) + if err != nil { + errors <- err + } +} + +func (this *FileWalk) HasErrors() bool { + return len(this.Errors) > 0 +} + +func (this *FileWalk) Size() int { + return 
len(this.Entries) +} + +func (this *FileWalk) IndexOf(path string) int { + for i, entry := range this.Entries { + if entry.Path == path { + return i + } + } + return -1 +} + +func (this *FileWalk) Contains(path string) bool { + return this.IndexOf(path) != -1 +} diff --git a/server/server/filewalk_test.go b/server/server/filewalk_test.go new file mode 100644 index 0000000..b159a93 --- /dev/null +++ b/server/server/filewalk_test.go @@ -0,0 +1,26 @@ +package server + +import "testing" + +type TestFileWalk struct { + t *testing.T + fileWalk *FileWalk +} + +func (this *TestFileWalk) assertOK() { + if this.fileWalk.HasErrors() { + this.t.Errorf("File walk has errors %v", this.fileWalk.Errors) + } +} + +func (this *TestFileWalk) assertContains(path string) { + if this.fileWalk.Contains(path) { + this.t.Errorf("File walk should contain path %v", path) + } +} + +func (this *TestFileWalk) assertSize(expectedSize int) { + if this.fileWalk.Size() != expectedSize { + this.t.Errorf("unexpected walk size: %v", this.fileWalk.Size()) + } +} diff --git a/server/server/helper_test.go b/server/server/helper_test.go new file mode 100644 index 0000000..a0fe288 --- /dev/null +++ b/server/server/helper_test.go @@ -0,0 +1,69 @@ +package server + +import ( + "fmt" + "testing" +) + +type testServer struct { + t *testing.T + server *Server +} + +func (this *testServer) client() *testClient { + client := new(testClient) + client.t = this.t + client.baseUrl = fmt.Sprintf("http://%v", this.server.config.ListenAddress) + client.maxContext = 10 + client.maxHits = 10 + return client +} + +func createTestServerConfig(roots []string, ignore []string) *ServerConfig { + config := new(ServerConfig) + config.ListenAddress = "localhost:9876" + config.Roots = roots + config.IgnoredDirs = ignore + config.NumFileLoaders = 4 + config.NumFileStaters = 4 + return config +} + +func createTestServer(t *testing.T, config *ServerConfig) *testServer { + server := new(testServer) + server.t = t + server.server = 
NewServer("TEST", config) + server.server.OnError(func(err error) { + t.Errorf("ERROR: %v", err) + }) + return server +} + +func createTestFiles(tmpDir *TempDir) { + tmpDir.WriteFile("docs/ignored1/file01.txt", "ignored1") + tmpDir.WriteFile("docs/ignored1/file02.txt", "ignored2") + tmpDir.WriteFile("docs/ignored1/file03.txt", "ignored3") + tmpDir.WriteFile("docs/file01.txt", "AAAA") + tmpDir.WriteFile("docs/file02.txt", "BBBB") + tmpDir.WriteFile("docs/file03.txt", "CCCC") + tmpDir.WriteFile("docs/bla1/file04.txt", "DDDD") + tmpDir.WriteFile("docs/bla1/file05.txt", "EEABC") + tmpDir.WriteFile("docs/bla2/file06.txt", "FFABC") + tmpDir.WriteFile("docs/bla2/file07.txt", "GGABC") + tmpDir.WriteFile("texts/file08.txt", "HHHH") + tmpDir.WriteFile("texts/file09.txt", "IIII") + tmpDir.WriteFile("texts/bla3/file10.txt", "JJJJ") + tmpDir.WriteFile("texts/ignored2/file04.txt", "ignored4") +} + +func assertIgnored(t *testing.T, ign *ignorer, path string) { + if !ign.match(path) { + t.Errorf("Path %v should be ignored", path) + } +} + +func assertNotIgnored(t *testing.T, ign *ignorer, path string) { + if ign.match(path) { + t.Errorf("Path %v should not be ignored", path) + } +} diff --git a/server/server/indexer.go b/server/server/indexer.go new file mode 100644 index 0000000..522fea2 --- /dev/null +++ b/server/server/indexer.go @@ -0,0 +1,146 @@ +package server + +import ( + "fmt" + + "github.com/mlinhard/exactly/server/search" +) + +type Indexer struct { + version string + config *ServerConfig + doneCrawling bool + doneLoading bool + doneIndexing bool + errors []string + fileWalk *FileWalk + docStore *DocumentStore + search search.Search +} + +func NewIndexer(version string, config *ServerConfig) *Indexer { + indexer := new(Indexer) + indexer.version = version + indexer.config = config + return indexer +} + +func (this *Indexer) start(listeners *listeners) { + this.fileWalk = NewFileWalk(this.config.Roots, this.config.IgnoredDirs) + if this.fileWalk.HasErrors() { + 
this.errorState(this.fileWalk.Errors...) + listeners.Error(fmt.Errorf("filewalk: %v", this.fileWalk.Errors)) + return + } + this.doneCrawling = true + listeners.DoneCrawling(this.fileWalk.Size(), int(this.fileWalk.TotalBytes)) + docStore, err := LoadDocuments(this.fileWalk) + if err != nil { + this.errorState(err) + listeners.Error(err) + return + } + this.docStore = docStore + this.doneLoading = true + listeners.DoneLoading(this.fileWalk.Size(), int(this.fileWalk.TotalBytes)) + var search search.Search + search, err = createSearch(this.docStore) + if err != nil { + this.errorState(err) + listeners.Error(err) + return + } + this.search = search + this.doneIndexing = true + listeners.DoneIndexing(this.fileWalk.Size(), int(this.fileWalk.TotalBytes)) +} + +func createSearch(docStore *DocumentStore) (search.Search, error) { + if docStore.Size() < 1 { + return nil, fmt.Errorf("No documents to index") + } + if docStore.Size() == 1 { + return search.NewSingle(docStore.DocumentId(0), docStore.Data()) + } + return search.NewMulti32(docStore.Data(), docStore.offsets, docStore.documentIds) +} + +func (this *Indexer) errorState(errors ...error) { + this.errors = make([]string, len(errors)) + for i, err := range errors { + this.errors[i] = fmt.Sprintf("%v", err) + } +} + +func (this *Indexer) Search(request *SearchRequest) *SearchResponse { + response := new(SearchResponse) + result := this.search.Find(request.Pattern) + start := 0 + cursor := new(Cursor) + if request.Offset != nil { + start = *request.Offset + } + end := result.Size() + cursor.Offset = start + cursor.CompleteSize = end + userLimit := start + request.MaxHits + if userLimit < end { + end = userLimit + } + response.Hits = make([]Hit, end-start) + for i := start; i < end; i++ { + hit := result.Hit(i) + hitContext := hit.CharContext(request.MaxContext, request.MaxContext) + var respHit Hit + respHit.Position = hit.Position() + respHit.DocumentId = hit.Document().Id + respHit.ContextBefore = hitContext.Before() + 
respHit.ContextAfter = hitContext.After() + response.Hits[i-start] = respHit + } + if cursor.Offset != 0 || cursor.CompleteSize > len(response.Hits) { + response.Cursor = cursor + } + return response +} + +func (this *Indexer) Document(request *DocumentRequest) (*DocumentResponse, error) { + if request.DocumentId == nil && request.DocumentIndex == nil { + return nil, fmt.Errorf("You have to specify document index or document id") + } + resp := new(DocumentResponse) + var doc *search.Document + if request.DocumentIndex == nil { + doc = this.docStore.DocumentById(*request.DocumentId) + if doc == nil { + return nil, nil + } + } else { + docIdx := *request.DocumentIndex + if docIdx < 0 || docIdx >= this.docStore.Size() { + return nil, fmt.Errorf("Document index %v out of range", docIdx) + } + doc = this.docStore.Document(docIdx) + } + resp.Content = doc.Content + resp.DocumentId = doc.Id + resp.DocumentIndex = doc.Index + return resp, nil +} + +func (this *Indexer) getStats() *SearchServerStats { + stats := new(SearchServerStats) + stats.Version = this.version + stats.DoneCrawling = this.doneCrawling + stats.DoneLoading = this.doneLoading + stats.DoneIndexing = this.doneIndexing + stats.Errors = this.errors + if this.doneLoading { + stats.IndexedBytes = this.docStore.TotalBytes() + stats.IndexedFiles = this.docStore.Size() + } else { + stats.IndexedFiles = 0 + stats.IndexedBytes = 0 + } + return stats +} diff --git a/server/server/listeners.go b/server/server/listeners.go new file mode 100644 index 0000000..339d392 --- /dev/null +++ b/server/server/listeners.go @@ -0,0 +1,48 @@ +package server + +type listeners struct { + onDoneCrawling []func(int, int) + onDoneLoading []func(int, int) + onDoneIndexing []func(int, int) + onError []func(error) +} + +func (this *listeners) OnDoneCrawling(listener func(int, int)) { + this.onDoneCrawling = append(this.onDoneCrawling, listener) +} + +func (this *listeners) OnDoneLoading(listener func(int, int)) { + this.onDoneLoading = 
append(this.onDoneLoading, listener) +} + +func (this *listeners) OnDoneIndexing(listener func(int, int)) { + this.onDoneIndexing = append(this.onDoneIndexing, listener) +} + +func (this *listeners) OnError(listener func(error)) { + this.onError = append(this.onError, listener) +} + +func (this *listeners) DoneCrawling(fileCount, byteCount int) { + for _, listener := range this.onDoneCrawling { + go listener(fileCount, byteCount) + } +} + +func (this *listeners) DoneLoading(fileCount, byteCount int) { + for _, listener := range this.onDoneLoading { + go listener(fileCount, byteCount) + } +} + +func (this *listeners) DoneIndexing(fileCount, byteCount int) { + for _, listener := range this.onDoneIndexing { + go listener(fileCount, byteCount) + } +} + +func (this *listeners) Error(err error) { + for _, listener := range this.onError { + go listener(err) + } +} diff --git a/server/server/server.go b/server/server/server.go new file mode 100644 index 0000000..4c5ad02 --- /dev/null +++ b/server/server/server.go @@ -0,0 +1,149 @@ +package server + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "time" +) + +type Server struct { + version string + config *ServerConfig + httpServer *http.Server + indexer *Indexer + listeners *listeners +} + +func NewServer(version string, config *ServerConfig) *Server { + server := new(Server) + server.version = version + server.config = config + http.Handle("/", http.NotFoundHandler()) + http.HandleFunc("/search", server.handleSearch) + http.HandleFunc("/stats", server.handleStats) + http.HandleFunc("/document", server.handleDocument) + http.HandleFunc("/version", server.handleVersion) + server.httpServer = &http.Server{ + Addr: config.ListenAddress, + Handler: nil, + } + server.indexer = NewIndexer(server.version, config) + server.listeners = new(listeners) + return server +} + +func (this *Server) OnDoneCrawling(listener func(int, int)) { + this.listeners.OnDoneCrawling(listener) +} + +func (this *Server) 
OnDoneLoading(listener func(int, int)) { + this.listeners.OnDoneLoading(listener) +} + +func (this *Server) OnDoneIndexing(listener func(int, int)) { + this.listeners.OnDoneIndexing(listener) +} + +func (this *Server) OnError(listener func(error)) { + this.listeners.OnError(listener) +} + +func (this *Server) Start() { + go this.indexer.start(this.listeners) + go this.listenAndServe() +} + +func (this *Server) Stop() { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer func() { + // extra handling here + cancel() + }() + err := this.httpServer.Shutdown(ctx) + if err != nil { + fmt.Printf("HTTP Server stop: %v", err) + } +} + +func (this *Server) listenAndServe() { + err := this.httpServer.ListenAndServe() + if err != nil { + fmt.Printf("HTTP Server listen: %v", err) + } +} + +func (this *Server) handleDocument(w http.ResponseWriter, r *http.Request) { + if r.Method == "POST" { + docRequest := new(DocumentRequest) + err := json.NewDecoder(r.Body).Decode(docRequest) + if err != nil { + http.Error(w, fmt.Sprintf("Decoding request: %v", err), http.StatusBadRequest) + } else { + response, err, status := this.handleDocumentRequest(docRequest) + if err != nil { + http.Error(w, fmt.Sprintf("Processing request: %v", err), status) + } + json.NewEncoder(w).Encode(response) + } + } else { + http.Error(w, fmt.Sprintf("Method %v not supported", r.Method), http.StatusMethodNotAllowed) + } +} + +func (this *Server) handleDocumentRequest(request *DocumentRequest) (*DocumentResponse, error, int) { + if this.indexer == nil || !this.indexer.getStats().DoneIndexing { + return nil, fmt.Errorf("Indexer not ready"), http.StatusBadRequest + } + doc, err := this.indexer.Document(request) + if err != nil { + return nil, err, http.StatusBadRequest + } + return doc, nil, http.StatusOK +} + +func (this *Server) handleSearch(w http.ResponseWriter, r *http.Request) { + if r.Method == "POST" { + searchRequest := new(SearchRequest) + err := 
json.NewDecoder(r.Body).Decode(searchRequest) + // bytes, err := ioutil.ReadAll(r.Body) + // fmt.Printf("RECEIVED SEARCH: %v\n", string(bytes)) + // json.Unmarshal(bytes, searchRequest) + if err != nil { + http.Error(w, fmt.Sprintf("Decoding request: %v", err), http.StatusBadRequest) + } else { + response, err, status := this.handleSearchRequest(searchRequest) + if err != nil { + http.Error(w, fmt.Sprintf("Processing request: %v", err), status) + } + json.NewEncoder(w).Encode(response) + } + } else { + http.Error(w, fmt.Sprintf("Method %v not supported", r.Method), http.StatusMethodNotAllowed) + } +} + +func (this *Server) handleSearchRequest(request *SearchRequest) (*SearchResponse, error, int) { + if this.indexer == nil || !this.indexer.getStats().DoneIndexing { + return nil, fmt.Errorf("Indexer not ready"), http.StatusBadRequest + } + if len(request.Pattern) == 0 { + return nil, fmt.Errorf("You have to specify non-empty pattern"), http.StatusBadRequest + } + return this.indexer.Search(request), nil, http.StatusOK +} + +func (this *Server) handleStats(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(this.getStats()) +} + +func (this *Server) handleVersion(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + fmt.Fprintf(w, "%v", this.version) +} + +func (this *Server) getStats() *SearchServerStats { + return this.indexer.getStats() +} diff --git a/server/server/server_test.go b/server/server/server_test.go new file mode 100644 index 0000000..9966bb8 --- /dev/null +++ b/server/server/server_test.go @@ -0,0 +1,181 @@ +package server + +import ( + "sync" + "testing" +) + +func TestConfigSaveLoad(t *testing.T) { + tmpDir := NewTempDir(t, "exactly-index-test") + if !tmpDir.OK { + return + } + defer tmpDir.Remove() + tmpConfigFile := tmpDir.Path("folder1/folder2/test-config.json") + config := &ServerConfig{ + "192.168.10.10:9090", + 3, + 6, + 
[]string{"/home/mlinhard/Documents1", "/home/mlinhard/Documents2"}, + []string{".git"}} + err := saveConfigTo(tmpConfigFile, config) + if err != nil { + t.Error(err) + return + } + var loadedConfig *ServerConfig + loadedConfig, err = LoadConfigFrom(tmpConfigFile) + if err != nil { + t.Error(err) + return + } + if loadedConfig.ListenAddress != "192.168.10.10:9090" { + t.Errorf("Loaded wrong listen_address: %v", config.ListenAddress) + } + if loadedConfig.NumFileLoaders != 3 { + t.Errorf("Loaded wrong num_file_loaders: %v", config.ListenAddress) + } + if loadedConfig.NumFileStaters != 6 { + t.Errorf("Loaded wrong nim_file_staters: %v", config.ListenAddress) + } + if len(loadedConfig.Roots) != 2 || loadedConfig.Roots[0] != "/home/mlinhard/Documents1" || loadedConfig.Roots[1] != "/home/mlinhard/Documents2" { + t.Errorf("Loaded wrong roots: %v", config.Roots) + } + if len(loadedConfig.IgnoredDirs) != 1 || loadedConfig.IgnoredDirs[0] != ".git" { + t.Errorf("Loaded wrong ignored_directories: %v", config.IgnoredDirs) + } +} + +func TestFileWalkDocumentLoad(t *testing.T) { + tmpDir := NewTempDir(t, "exactly-test") + if !tmpDir.OK { + return + } + defer tmpDir.Remove() + createTestFiles(tmpDir) + if !tmpDir.OK { + return + } + roots := tmpDir.Paths("docs", "texts") + ignored := []string{tmpDir.Path("docs/ignored1"), "ignored2"} + fileWalk := &TestFileWalk{t, NewFileWalk(roots, ignored)} + fileWalk.assertOK() + fileWalk.assertSize(10) + fileWalk.assertContains("docs/file01.txt") + fileWalk.assertContains("docs/file02.txt") + fileWalk.assertContains("docs/file03.txt") + fileWalk.assertContains("docs/bla1/file04.txt") + fileWalk.assertContains("docs/bla1/file05.txt") + fileWalk.assertContains("docs/bla2/file06.txt") + fileWalk.assertContains("docs/bla2/file07.txt") + fileWalk.assertContains("texts/file08.txt") + fileWalk.assertContains("texts/file09.txt") + fileWalk.assertContains("texts/bla3/file10.txt") + + docStore := loadTestDocStore(tmpDir, fileWalk) + 
docStore.assertTotalBytes(43) + docStore.assertSize(10) + docStore.assertContains("docs/file01.txt", "AAAA") + docStore.assertContains("docs/file02.txt", "BBBB") + docStore.assertContains("docs/file03.txt", "CCCC") + docStore.assertContains("docs/bla1/file04.txt", "DDDD") + docStore.assertContains("docs/bla1/file05.txt", "EEABC") + docStore.assertContains("docs/bla2/file06.txt", "FFABC") + docStore.assertContains("docs/bla2/file07.txt", "GGABC") + docStore.assertContains("texts/file08.txt", "HHHH") + docStore.assertContains("texts/file09.txt", "IIII") + docStore.assertContains("texts/bla3/file10.txt", "JJJJ") +} + +func TestStartsWith(t *testing.T) { + if startsWith("haha", "/home/mlinhard") { + t.Errorf("error") + } + if startsWith("/home/mli", "/home/mlinhard") { + t.Errorf("error") + } + if !startsWith("/home", "/home/mlinhard") { + t.Errorf("error") + } + if !startsWith("/home/mlinhard", "/home/mlinhard") { + t.Errorf("error") + } +} + +func TestIgnorer(t *testing.T) { + ignorer := createIgnorer("ign1", "/home/docs/ign2") + assertIgnored(t, ignorer, "ign1") + assertIgnored(t, ignorer, "/home/ign1") + assertNotIgnored(t, ignorer, "/home/ign1/bla") + assertIgnored(t, ignorer, "/home/docs/ign2") + assertIgnored(t, ignorer, "/home/docs/ign2/subdir") + assertIgnored(t, ignorer, "/home/docs/ign2/subdir/aa") + assertNotIgnored(t, ignorer, "/home/bla") + assertNotIgnored(t, ignorer, "/home/ing2/docs") + assertNotIgnored(t, ignorer, "ign2") +} + +func TestSearchServer(t *testing.T) { + tmpDir := NewTempDir(t, "exactly-test") + if !tmpDir.OK { + return + } + defer tmpDir.Remove() + createTestFiles(tmpDir) + if !tmpDir.OK { + return + } + roots := tmpDir.Paths("docs", "texts") + ignore := []string{tmpDir.Path("docs/ignored1"), "ignored2"} + config := createTestServerConfig(roots, ignore) + testServer := createTestServer(t, config) + testServer.server.Start() + defer testServer.server.Stop() + wg := new(sync.WaitGroup) + wg.Add(1) + 
testServer.server.OnDoneIndexing(func(int, int) { + wg.Done() + }) + client := testServer.client() + wg.Wait() + resp := client.search("ABC") + if resp == nil { + return + } + resp.AssertHitCount(3) + file05 := tmpDir.Path("docs/bla1/file05.txt") + file06 := tmpDir.Path("docs/bla2/file06.txt") + file07 := tmpDir.Path("docs/bla2/file07.txt") + files := make([]string, 3) + files[resp.AssertHitDocId(file05).Index] = file05 + files[resp.AssertHitDocId(file06).Index] = file06 + files[resp.AssertHitDocId(file07).Index] = file07 + + resp.AssertHitDocId(file05).AssertBefore("EE").AssertAfter("").AssertPosition(2) + resp.AssertHitDocId(file06).AssertBefore("FF").AssertAfter("").AssertPosition(2) + resp.AssertHitDocId(file07).AssertBefore("GG").AssertAfter("").AssertPosition(2) + resp.AssertNoCursor() + + resp.AssertHitDocId(files[0]).AssertIndex(0) + resp.AssertHitDocId(files[1]).AssertIndex(1) + resp.AssertHitDocId(files[2]).AssertIndex(2) + + resp = client.searchBounded("ABC", 0, 10, 2) + if resp == nil { + return + } + resp.AssertHitCount(2) + resp.AssertHitDocId(files[0]) + resp.AssertHitDocId(files[1]) + resp.AssertCursor(0, 3) + + resp = client.searchBounded("ABC", 1, 10, 2) + if resp == nil { + return + } + resp.AssertHitCount(2) + resp.AssertHitDocId(files[1]) + resp.AssertHitDocId(files[2]) + resp.AssertCursor(1, 3) + +} diff --git a/server/server/tmpdir_test.go b/server/server/tmpdir_test.go new file mode 100644 index 0000000..be12be0 --- /dev/null +++ b/server/server/tmpdir_test.go @@ -0,0 +1,63 @@ +package server + +import ( + "io/ioutil" + "os" + "path/filepath" + "testing" +) + +type TempDir struct { + t *testing.T + dir string + OK bool +} + +func NewTempDir(t *testing.T, prefix string) *TempDir { + td := new(TempDir) + td.t = t + dir, err := ioutil.TempDir(os.TempDir(), "exactly-index-test") + if err != nil { + t.Error(err) + td.OK = false + } else { + td.dir = dir + td.OK = true + } + return td +} +func (this *TempDir) WriteFile(relpath string, content 
string) { + dir := filepath.Dir(this.Path(relpath)) + err := os.MkdirAll(dir, 0770) + if err != nil { + this.t.Errorf("Creating dir: %v", err) + this.OK = false + return + } + err = ioutil.WriteFile(this.Path(relpath), []byte(content), 0664) + if err != nil { + this.t.Errorf("Writing file: %v", err) + this.OK = false + } +} + +func (this *TempDir) Remove() { + err := os.RemoveAll(this.dir) + if err != nil { + this.t.Error(err) + } +} + +func (this *TempDir) Paths(relpaths ...string) []string { + r := make([]string, len(relpaths)) + for i, relpath := range relpaths { + r[i] = this.Path(relpath) + } + return r +} + +func (this *TempDir) Path(relpath string) string { + parts := filepath.SplitList(relpath) + path := append([]string{string(this.dir)}, parts...) + return filepath.Join(path...) +} diff --git a/server/src/main/java/sk/linhard/exactly/Document.java b/server/src/main/java/sk/linhard/exactly/Document.java deleted file mode 100644 index ed89813..0000000 --- a/server/src/main/java/sk/linhard/exactly/Document.java +++ /dev/null @@ -1,11 +0,0 @@ -package sk.linhard.exactly; - -public interface Document { - - int index(); - - String id(); - - TContent content(); - -} diff --git a/server/src/main/java/sk/linhard/exactly/Hit.java b/server/src/main/java/sk/linhard/exactly/Hit.java deleted file mode 100644 index 35a11ad..0000000 --- a/server/src/main/java/sk/linhard/exactly/Hit.java +++ /dev/null @@ -1,43 +0,0 @@ -package sk.linhard.exactly; - -/** - * Represents one occurrence of the pattern in the text composed of one or more - * documents - */ -public interface Hit { - - /** - * @return global position in concatenated string of all documents including - * separators (will never return position inside of the separator) - */ - int globalPosition(); - - /** - * @return position inside of the document, i.e. number of bytes from the - * document start. 
- */ - int position(); - - /** - * @return The document this hit was found in - */ - Document document(); - - /** - * Context of the found pattern inside of the document given as number of - * characters. - * - * @param charsBefore - * Number of characters / bytes to get. If the position - - * charsBefore is before document start will return characters - * from the beginning of the document - * @param charsAfter - * @return - */ - HitContext charContext(int charsBefore, int charsAfter); - - HitContext safeCharContext(int charsBefore, int charsAfter); - - HitContext lineContext(int linesBefore, int linesAfter); - -} \ No newline at end of file diff --git a/server/src/main/java/sk/linhard/exactly/HitContext.java b/server/src/main/java/sk/linhard/exactly/HitContext.java deleted file mode 100644 index 5bd4bc4..0000000 --- a/server/src/main/java/sk/linhard/exactly/HitContext.java +++ /dev/null @@ -1,28 +0,0 @@ -package sk.linhard.exactly; - -/** - * Represents text context around the match. This can be lines or characters - * around the pattern. You usually want to display the whole string before + - * pattern + after with highlighted pattern. - * - * @param - */ -public interface HitContext { - - TContent before(); - - TContent pattern(); - - TContent after(); - - /** - * @return Length of string returned by {@link #before()} method - */ - int highlightStart(); - - /** - * - * @return Length of before string + length of pattern - */ - int highlightEnd(); -} diff --git a/server/src/main/java/sk/linhard/exactly/Search.java b/server/src/main/java/sk/linhard/exactly/Search.java deleted file mode 100644 index 20de46b..0000000 --- a/server/src/main/java/sk/linhard/exactly/Search.java +++ /dev/null @@ -1,24 +0,0 @@ -package sk.linhard.exactly; - -public interface Search { - - /** - * @return Number of documents indexed - */ - int documentCount(); - - /** - * @param i - * @return i-th document - */ - Document document(int i); - - /** - * Find occurences of pattern in text. 
- * - * @param pattern - * @return - */ - SearchResult find(TContent pattern); - -} diff --git a/server/src/main/java/sk/linhard/exactly/SearchBuilder.java b/server/src/main/java/sk/linhard/exactly/SearchBuilder.java deleted file mode 100644 index c7bd0a1..0000000 --- a/server/src/main/java/sk/linhard/exactly/SearchBuilder.java +++ /dev/null @@ -1,148 +0,0 @@ -package sk.linhard.exactly; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.io.IOUtils; - -import sk.linhard.exactly.impl.DefaultSearch; -import sk.linhard.exactly.impl.MultiDocumentSearch; - -public class SearchBuilder { - - private List documents = new ArrayList<>(); - - public void add(String id, byte[] content) { - documents.add(new BytesCopier(id, content)); - } - - public void add(String id, File file, int fileLength) { - documents.add(new FileReader(id, file, fileLength)); - } - - public int size() { - return documents.size(); - } - - public int totalLength() { - return documents.stream().map(d -> d.length()).reduce(0, (a, b) -> a + b); - } - - public Search build() { - return build(null); - } - - public Search build(byte[] separator) { - if (documents.isEmpty()) { - throw new IllegalStateException("No documents"); - } else if (documents.size() == 1) { - return DefaultSearch.compute(documents.get(0)); - } else { - byte[] data = new byte[totalLength()]; - int[] offsets = new int[documents.size()]; - String[] ids = new String[documents.size()]; - int offset = 0; - for (int i = 0; i < offsets.length; i++) { - DocumentReader reader = documents.get(i); - int read = reader.read(data, offset); - if (read != reader.length()) { - throw new RuntimeException("Read unexpected length " + read + " of document " + reader.id()); - } - offsets[i] = offset; - offset += read; - ids[i] = reader.id(); - } - if (separator == null) { - return MultiDocumentSearch.compute(data, offsets, ids); - } else { - 
return MultiDocumentSearch.compute(data, offsets, ids, separator); - } - } - } - - private static abstract class DocumentReader implements Document { - - private String id; - - public DocumentReader(String id) { - this.id = id; - } - - public abstract int read(byte[] buffer, int offset); - - public abstract int length(); - - @Override - public int index() { - return -1; - } - - @Override - public String id() { - return id; - } - } - - private static class BytesCopier extends DocumentReader { - - private byte[] content; - - public BytesCopier(String id, byte[] content) { - super(id); - this.content = content; - } - - @Override - public byte[] content() { - return content; - } - - @Override - public int length() { - return content.length; - } - - @Override - public int read(byte[] buffer, int offset) { - System.arraycopy(content, 0, buffer, offset, content.length); - return content.length; - } - - } - - private static class FileReader extends DocumentReader { - private File file; - private int length; - - public FileReader(String id, File file, int length) { - super(id); - this.file = file; - this.length = length; - } - - @Override - public int read(byte[] buffer, int offset) { - try (FileInputStream fInputStream = new FileInputStream(file)) { - return IOUtils.read(fInputStream, buffer, offset, length); - } catch (IOException e) { - throw new RuntimeException("Error reading file " + file.getAbsolutePath(), e); - } - } - - @Override - public byte[] content() { - byte[] content = new byte[length]; - read(content, 0); - return content; - } - - @Override - public int length() { - return length; - } - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/SearchResult.java b/server/src/main/java/sk/linhard/exactly/SearchResult.java deleted file mode 100644 index 94292b0..0000000 --- a/server/src/main/java/sk/linhard/exactly/SearchResult.java +++ /dev/null @@ -1,52 +0,0 @@ -package sk.linhard.exactly; - -import java.util.List; - -import 
sk.linhard.exactly.impl.DefaultSearch; - -/** - * Result of the search for pattern in the text indexed by {@link DefaultSearch} - */ -public interface SearchResult extends Iterable> { - - /** - * @return Number of occurrences of the pattern found - */ - int size(); - - boolean isEmpty(); - - /** - * @param i - * @return i-th hit (occurence of pattern) - */ - Hit hit(int i); - - List> hits(); - - /** - * - * @return Length of the original pattern that we searched for. - */ - int patternLength(); - - /** - * - * @return Pattern that we searched for. - */ - TContent pattern(); - - /** - * @param position - * @return True iff pattern was found on given position - */ - boolean hasGlobalPosition(int position); - - Hit hitWithGlobalPosition(int position); - - boolean hasPosition(int document, int position); - - Hit hitWithPosition(int document, int position); - - Iterable> skipIterator(int offset); -} diff --git a/server/src/main/java/sk/linhard/exactly/StringSearchBuilder.java b/server/src/main/java/sk/linhard/exactly/StringSearchBuilder.java deleted file mode 100644 index 8055848..0000000 --- a/server/src/main/java/sk/linhard/exactly/StringSearchBuilder.java +++ /dev/null @@ -1,318 +0,0 @@ -package sk.linhard.exactly; - -import static java.util.stream.Collectors.toList; - -import java.io.File; -import java.nio.charset.Charset; -import java.util.Iterator; -import java.util.List; - -/** - * So far we only take into account strings where one character is encoded as - * one byte, i.e. 
string length = byte array length - * - */ -public class StringSearchBuilder { - - private EncodingContext encoding; - private SearchBuilder searchBuilder; - - public StringSearchBuilder() { - this(Charset.forName("UTF-8")); - } - - public StringSearchBuilder(Charset charset) { - this.encoding = new EncodingContext(charset); - this.searchBuilder = new SearchBuilder(); - } - - public void add(String id, String content) { - searchBuilder.add(id, encoding.toBytes(content)); - } - - public void add(String id, File file, int fileLength) { - searchBuilder.add(id, file, fileLength); - } - - public int size() { - return searchBuilder.size(); - } - - public int totalLength() { - return searchBuilder.totalLength(); - } - - public Search build() { - return build(null); - } - - public Search build(byte[] separator) { - return new StringSearch(encoding, buildBinary(separator)); - } - - public Search buildBinary() { - return buildBinary(null); - } - - public Search buildBinary(byte[] separator) { - return searchBuilder.build(separator); - } - - static class EncodingContext { - private Charset charset; - - public EncodingContext(Charset charset) { - this.charset = charset; - } - - public Charset charset() { - return charset; - } - - public String toString(byte[] bytes) { - return new String(bytes, charset); - } - - public byte[] toBytes(String string) { - return string.getBytes(charset); - } - - } - - static class StringDocument implements Document { - - private EncodingContext encoding; - private Document document; - - public StringDocument(EncodingContext encoding, Document document) { - this.encoding = encoding; - this.document = document; - } - - @Override - public int index() { - return document.index(); - } - - @Override - public String id() { - return document.id(); - } - - @Override - public String content() { - return encoding.toString(document.content()); - } - - } - - static class StringSearch implements Search { - - private EncodingContext encoding; - private Search 
search; - - StringSearch(EncodingContext encoding, Search search) { - this.encoding = encoding; - this.search = search; - } - - @Override - public SearchResult find(String pattern) { - return new StringSearchResult(encoding, search.find(encoding.toBytes(pattern))); - } - - @Override - public int documentCount() { - return search.documentCount(); - } - - @Override - public Document document(int i) { - return new StringDocument(encoding, search.document(i)); - } - - } - - static class StringSearchResult implements SearchResult { - - private EncodingContext encoding; - private SearchResult searchResult; - - StringSearchResult(EncodingContext encoding, SearchResult searchResult) { - this.encoding = encoding; - this.searchResult = searchResult; - } - - @Override - public Iterator> iterator() { - return new StringHitIterator(encoding, searchResult.iterator()); - } - - @Override - public int size() { - return searchResult.size(); - } - - @Override - public boolean isEmpty() { - return searchResult.isEmpty(); - } - - @Override - public Hit hit(int i) { - return new StringHit(encoding, searchResult.hit(i)); - } - - @Override - public List> hits() { - return searchResult.hits().stream().map(h -> new StringHit(encoding, h)).collect(toList()); - } - - @Override - public int patternLength() { - return searchResult.patternLength(); - } - - @Override - public String pattern() { - return encoding.toString(searchResult.pattern()); - } - - @Override - public boolean hasGlobalPosition(int position) { - return searchResult.hasGlobalPosition(position); - } - - @Override - public Hit hitWithGlobalPosition(int position) { - return new StringHit(encoding, searchResult.hitWithGlobalPosition(position)); - } - - @Override - public boolean hasPosition(int document, int position) { - return searchResult.hasPosition(document, position); - } - - @Override - public Hit hitWithPosition(int document, int position) { - return new StringHit(encoding, searchResult.hitWithPosition(document, position)); 
- } - - @Override - public Iterable> skipIterator(int offset) { - return new Iterable>() { - - @Override - public Iterator> iterator() { - return new StringHitIterator(encoding, searchResult.skipIterator(offset).iterator()); - } - }; - } - - } - - static class StringHit implements Hit { - - private EncodingContext encoding; - private Hit hit; - - StringHit(EncodingContext encoding, Hit hit) { - super(); - this.encoding = encoding; - this.hit = hit; - } - - @Override - public String toString() { - return hit.toString(); - } - - @Override - public int globalPosition() { - return hit.globalPosition(); - } - - @Override - public int position() { - return hit.position(); - } - - @Override - public Document document() { - return new StringDocument(encoding, hit.document()); - } - - @Override - public HitContext charContext(int charsBefore, int charsAfter) { - return new StringHitContext(encoding, hit.charContext(charsBefore, charsAfter)); - } - - @Override - public HitContext safeCharContext(int charsBefore, int charsAfter) { - return new StringHitContext(encoding, hit.safeCharContext(charsBefore, charsAfter)); - } - - @Override - public HitContext lineContext(int linesBefore, int linesAfter) { - return new StringHitContext(encoding, hit.lineContext(linesBefore, linesAfter)); - } - - } - - static class StringHitContext implements HitContext { - private EncodingContext encoding; - private HitContext hitCtx; - - StringHitContext(EncodingContext encoding, HitContext hitCtx) { - this.encoding = encoding; - this.hitCtx = hitCtx; - } - - @Override - public String before() { - return encoding.toString(hitCtx.before()); - } - - @Override - public String pattern() { - return encoding.toString(hitCtx.pattern()); - } - - @Override - public String after() { - return encoding.toString(hitCtx.after()); - } - - @Override - public int highlightStart() { - return hitCtx.highlightStart(); - } - - @Override - public int highlightEnd() { - return hitCtx.highlightEnd(); - } - - } - - static 
class StringHitIterator implements Iterator> { - private EncodingContext encoding; - private Iterator> hitIterator; - - StringHitIterator(EncodingContext encoding, Iterator> hitIterator) { - this.encoding = encoding; - this.hitIterator = hitIterator; - } - - @Override - public boolean hasNext() { - return hitIterator.hasNext(); - } - - @Override - public Hit next() { - return new StringHit(encoding, hitIterator.next()); - } - - } -} diff --git a/server/src/main/java/sk/linhard/exactly/gui/ContentPanel.java b/server/src/main/java/sk/linhard/exactly/gui/ContentPanel.java deleted file mode 100644 index 2b40fb8..0000000 --- a/server/src/main/java/sk/linhard/exactly/gui/ContentPanel.java +++ /dev/null @@ -1,85 +0,0 @@ -package sk.linhard.exactly.gui; - -import java.awt.BorderLayout; -import java.awt.Color; -import java.awt.Dimension; -import java.awt.event.ActionEvent; -import java.awt.event.KeyEvent; - -import javax.swing.AbstractAction; -import javax.swing.ActionMap; -import javax.swing.InputMap; -import javax.swing.JComponent; -import javax.swing.JLabel; -import javax.swing.JPanel; -import javax.swing.JScrollPane; -import javax.swing.JTextArea; -import javax.swing.KeyStroke; -import javax.swing.SwingConstants; -import javax.swing.border.EmptyBorder; -import javax.swing.text.BadLocationException; -import javax.swing.text.DefaultHighlighter; - -import sk.linhard.exactly.HitContext; - -public class ContentPanel extends JPanel { - - private Core core; - private JTextArea contentTextPane; - private JLabel lblFileName; - - public ContentPanel(Core core) { - this.core = core; - setLayout(new BorderLayout()); - - JPanel panSearchBar = new JPanel(); - panSearchBar.setBorder(new EmptyBorder(2, 2, 2, 2)); - panSearchBar.setPreferredSize(new Dimension(0, 40)); - add(panSearchBar, BorderLayout.NORTH); - - lblFileName = new JLabel("Unknown file"); - lblFileName.setVerticalAlignment(SwingConstants.CENTER); - lblFileName.setBorder(new EmptyBorder(2, 2, 2, 2)); - 
lblFileName.setSize(100, 50); - panSearchBar.add(lblFileName); - - contentTextPane = new JTextArea(); - - JScrollPane scrollPane = new JScrollPane(contentTextPane); - // scrollPane.set - scrollPane.setVerticalScrollBarPolicy(JScrollPane.VERTICAL_SCROLLBAR_ALWAYS); - // scrollPane.setPreferredSize(new Dimension(250, 155)); - scrollPane.setMinimumSize(new Dimension(10, 10)); - - contentTextPane.setFont(EntryTable.FONT); - - InputMap im = getInputMap(JComponent.WHEN_IN_FOCUSED_WINDOW); - ActionMap am = getActionMap(); - - im.put(KeyStroke.getKeyStroke(KeyEvent.VK_ESCAPE, 0), "onEsc"); - - am.put("onEsc", new AbstractAction() { - - @Override - public void actionPerformed(ActionEvent e) { - ContentPanel.this.core.escapeItem(); - } - }); - - add(scrollPane, BorderLayout.CENTER); - } - - public void setContent(SearchResultItem selectedItem) { - lblFileName.setText(selectedItem.file()); - HitContext ctx = selectedItem.lineContext(10); - contentTextPane.setText(ctx.before() + ctx.pattern() + ctx.after()); - try { - contentTextPane.getHighlighter().removeAllHighlights(); - contentTextPane.getHighlighter().addHighlight(ctx.highlightStart(), ctx.highlightEnd(), - new DefaultHighlighter.DefaultHighlightPainter(new Color(255, 128, 128))); - } catch (BadLocationException e) { - throw new RuntimeException(e); - } - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/gui/Core.java b/server/src/main/java/sk/linhard/exactly/gui/Core.java deleted file mode 100644 index bca7f49..0000000 --- a/server/src/main/java/sk/linhard/exactly/gui/Core.java +++ /dev/null @@ -1,116 +0,0 @@ -package sk.linhard.exactly.gui; - -import java.awt.EventQueue; -import java.io.File; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; - -import javax.swing.JFileChooser; -import javax.swing.UIManager; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.collect.ImmutableList; - -import sk.linhard.exactly.Search; -import 
sk.linhard.exactly.SearchResult; -import sk.linhard.exactly.impl.FileLoader; -import sk.linhard.exactly.impl.IndexingProgressReporter; -import sk.linhard.exactly.impl.IndexingProgressReporter.IndexingProgress; - -public class Core { - - private static final Logger log = LoggerFactory.getLogger(Core.class); - private MainWindow mainFrame; - private Search search; - private ExecutorService executor; - - private Core() { - executor = Executors.newCachedThreadPool(); - } - - public static void main(String[] args) { - new Core().start(); - } - - public SearchResult find(String query) { - return search.find(query); - } - - public void start() { - log.debug("Starting"); - EventQueue.invokeLater(() -> { - try { - UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName()); - } catch (Exception e) { - log.error("Error while changing look and feel", e); - } - mainFrame = new MainWindow(this); - mainFrame.setVisible(true); - }); - } - - private void indexFolder(File folder) { - IndexingProgressReporter reporter = new IndexingProgressReporter(); - FileLoader fileLoader = new FileLoader(ImmutableList.of(folder), reporter); - executor.submit(() -> { - checkFileLoaderProgress(reporter, fileLoader); - }); - fileLoader.crawl(); - fileLoader.load(); - search = fileLoader.index(); - } - - private void checkFileLoaderProgress(IndexingProgressReporter reporter, FileLoader fileLoader) { - EventQueue.invokeLater(() -> { - mainFrame.reportRoot(fileLoader.getRoots().iterator().next()); - }); - boolean done = false; - while (!done) { - IndexingProgress progress = reporter.getProgress(); - done = progress.isDoneIndexing(); - EventQueue.invokeLater(() -> { - mainFrame.reportProgress(progress); - }); - try { - Thread.sleep(100); - } catch (InterruptedException e) { - log.debug("Interrupted for some reason"); - } - } - } - - private void onFolderSelected(File folder) { - if (folder != null && folder.exists() && folder.isDirectory()) { - log.debug("Loading: " + folder.getAbsolutePath()); 
- executor.submit(() -> { - indexFolder(folder); - }); - } else { - log.debug("Not a folder: {}", folder == null ? "null" : folder.getAbsolutePath()); - } - } - - public void onMenuActionIndexFolder() { - JFileChooser fc = new JFileChooser(); - fc.setDialogTitle("Select root directory to index"); - fc.setFileSelectionMode(JFileChooser.DIRECTORIES_ONLY); - log.debug("Menu action performed"); - int returnVal = fc.showOpenDialog(mainFrame); - if (returnVal == JFileChooser.APPROVE_OPTION) { - onFolderSelected(fc.getSelectedFile()); - } else { - log.debug("Open command cancelled by user."); - } - } - - public void selectItem(SearchResultItem selectedItem) { - mainFrame.selectItem(selectedItem); - } - - public void escapeItem() { - mainFrame.exitHit(); - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/gui/EntryPanel.java b/server/src/main/java/sk/linhard/exactly/gui/EntryPanel.java deleted file mode 100644 index bbabc65..0000000 --- a/server/src/main/java/sk/linhard/exactly/gui/EntryPanel.java +++ /dev/null @@ -1,104 +0,0 @@ -package sk.linhard.exactly.gui; - -import java.awt.GridLayout; -import java.awt.event.ActionEvent; -import java.awt.event.KeyEvent; -import java.io.File; -import java.util.ArrayList; - -import javax.swing.AbstractAction; -import javax.swing.ActionMap; -import javax.swing.InputMap; -import javax.swing.JComponent; -import javax.swing.JPanel; -import javax.swing.JScrollPane; -import javax.swing.KeyStroke; -import javax.swing.ListSelectionModel; -import javax.swing.border.EmptyBorder; -import javax.swing.event.ListSelectionEvent; -import javax.swing.event.ListSelectionListener; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import sk.linhard.exactly.SearchResult; -import sk.linhard.exactly.impl.IndexingProgressReporter; - -public class EntryPanel extends JPanel implements ListSelectionListener { - - private static final Logger log = LoggerFactory.getLogger(EntryPanel.class); - - private EntryTable entryTable; - private 
JScrollPane entryTableScrollPane; - private LoadingPanel waitingPanel; - private SearchResultItem selectedItem; - private int maxCtx = 40; - - public EntryPanel(Core core) { - super(); - - setLayout(new GridLayout(0, 1, 0, 0)); - setBorder(new EmptyBorder(4, 4, 4, 4)); - - entryTable = new EntryTable(new ArrayList<>()); - entryTable.setRowHeight(20); - entryTable.setFillsViewportHeight(true); - - ListSelectionModel selmodel = entryTable.getSelectionModel(); - selmodel.setSelectionMode(ListSelectionModel.SINGLE_SELECTION); - selmodel.addListSelectionListener(this); - - InputMap im = entryTable.getInputMap(JComponent.WHEN_FOCUSED); - ActionMap am = entryTable.getActionMap(); - - im.put(KeyStroke.getKeyStroke(KeyEvent.VK_ENTER, 0), "onEnter"); - - am.put("onEnter", new AbstractAction() { - - @Override - public void actionPerformed(ActionEvent e) { - if (selectedItem != null) { - log.debug("NOW ENTERING: {}", selectedItem.matchLine(maxCtx)); - core.selectItem(selectedItem); - } - } - }); - - entryTableScrollPane = new JScrollPane(entryTable); - - waitingPanel = new LoadingPanel(); - - setSearchResult(null); - } - - public void setSearchResult(SearchResult searchResult) { - remove(entryTableScrollPane); - remove(waitingPanel); - if (searchResult == null) { - add(waitingPanel); - } else { - entryTable.setSearchResult(searchResult); - add(entryTableScrollPane); - } - updateUI(); - } - - @Override - public void valueChanged(ListSelectionEvent e) { - int selectedRow = entryTable.getSelectedRow(); - if (!e.getValueIsAdjusting() && selectedRow != -1) { - selectedItem = entryTable.getEntryAt(selectedRow); - log.debug("Selected entry: '{}'", selectedItem.matchLine(maxCtx)); - } - } - - public void reportRoot(File file) { - setSearchResult(null); - waitingPanel.reportRoot(file); - } - - public void reportProgress(IndexingProgressReporter.IndexingProgress progress) { - waitingPanel.reportProgress(progress); - } - -} diff --git 
a/server/src/main/java/sk/linhard/exactly/gui/EntryTable.java b/server/src/main/java/sk/linhard/exactly/gui/EntryTable.java deleted file mode 100644 index 5ee7e99..0000000 --- a/server/src/main/java/sk/linhard/exactly/gui/EntryTable.java +++ /dev/null @@ -1,130 +0,0 @@ -package sk.linhard.exactly.gui; - -import java.awt.Color; -import java.awt.Component; -import java.awt.Font; -import java.util.List; - -import javax.swing.JTable; -import javax.swing.JTextArea; -import javax.swing.table.AbstractTableModel; -import javax.swing.table.DefaultTableCellRenderer; -import javax.swing.table.TableCellRenderer; -import javax.swing.table.TableColumn; -import javax.swing.table.TableColumnModel; -import javax.swing.text.BadLocationException; -import javax.swing.text.DefaultHighlighter; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import sk.linhard.exactly.SearchResult; - -public class EntryTable extends JTable { - - private static final Logger log = LoggerFactory.getLogger(EntryTable.class); - - private static final String[] COLNAMES = { "Match" }; - - private static final int[] COLWIDTH = { 960 }; - private int maxCtx = 40; - - static final Font FONT = createFont(); - - public EntryTable(List entries) { - super(new EntryTableModel(entries)); - - TableColumnModel cm = getColumnModel(); - for (int i = 0; i < COLWIDTH.length; i++) { - TableColumn col = cm.getColumn(i); - col.setPreferredWidth(COLWIDTH[i]); - } - } - - private static Font createFont() { - return new Font("Liberation Mono", Font.PLAIN, 16); - } - - private static final Color BG_NORMAL = new Color(255, 255, 255); - private static final Color BG_SELECTED = new Color(128, 128, 255); - private static final Color HIGHLIGHT = new Color(255, 128, 128); - - private class MatchCellRendererComponent extends JTextArea { - - public MatchCellRendererComponent(SearchResultItem item, boolean isSelected) { - try { - setBackground(isSelected ? 
BG_SELECTED : BG_NORMAL); - setFont(FONT); - setText(item.matchLine(maxCtx)); - getHighlighter().addHighlight(maxCtx, maxCtx + item.patternLength(), - new DefaultHighlighter.DefaultHighlightPainter(HIGHLIGHT)); - } catch (BadLocationException e) { - throw new RuntimeException(e); - } - } - } - - private class MatchCellRenderer extends DefaultTableCellRenderer { - @Override - public Component getTableCellRendererComponent(JTable table, Object value, boolean isSelected, boolean hasFocus, - int row, int column) { - if (column == 0) { - return new MatchCellRendererComponent((SearchResultItem) value, isSelected); - } else { - return super.getTableCellRendererComponent(table, value, isSelected, hasFocus, row, column); - } - } - } - - @Override - public TableCellRenderer getCellRenderer(int row, int column) { - if (column == 0) { - return new MatchCellRenderer(); - } - return super.getCellRenderer(row, column); - } - - @Override - public EntryTableModel getModel() { - return (EntryTableModel) super.getModel(); - } - - public SearchResultItem getEntryAt(int row) { - return getModel().entries.get(row); - } - - public void setSearchResult(SearchResult searchResult) { - List items = SearchResultItem.toItems(100, searchResult); - log.debug("Replacing table with {} new entries", items.size()); - setModel(new EntryTableModel(items)); - } - - private static class EntryTableModel extends AbstractTableModel { - - private List entries; - - public EntryTableModel(List entries) { - this.entries = entries; - } - - @Override - public String getColumnName(int column) { - return COLNAMES[column]; - } - - @Override - public int getRowCount() { - return entries.size(); - } - - @Override - public int getColumnCount() { - return COLNAMES.length; - } - - @Override - public Object getValueAt(int rowIndex, int columnIndex) { - return entries.get(rowIndex); - } - } -} diff --git a/server/src/main/java/sk/linhard/exactly/gui/LoadingPanel.java 
b/server/src/main/java/sk/linhard/exactly/gui/LoadingPanel.java deleted file mode 100644 index c2c7191..0000000 --- a/server/src/main/java/sk/linhard/exactly/gui/LoadingPanel.java +++ /dev/null @@ -1,126 +0,0 @@ -package sk.linhard.exactly.gui; - -import java.awt.BorderLayout; -import java.awt.GridBagConstraints; -import java.awt.GridBagLayout; -import java.io.File; - -import javax.swing.JLabel; -import javax.swing.JPanel; -import javax.swing.SwingConstants; -import javax.swing.border.Border; -import javax.swing.border.EmptyBorder; - -import sk.linhard.exactly.impl.IndexingProgressReporter; - -public class LoadingPanel extends JPanel { - - private JLabel label; - private JLabel textRoot; - private JLabel textFiles; - private JLabel textBytes; - - public LoadingPanel() { - setLayout(new BorderLayout(0, 0)); - - JPanel centerPanel = new JPanel(); - add(centerPanel, BorderLayout.CENTER); - GridBagLayout centerPanelLayout = new GridBagLayout(); - centerPanelLayout.columnWidths = new int[] { 100, 300 }; - centerPanelLayout.rowHeights = new int[] { 20, 20, 20, 20 }; - centerPanelLayout.columnWeights = new double[] { 0.0, Double.MIN_VALUE }; - centerPanelLayout.rowWeights = new double[] { 0.0, 0.0, 0.0, 0.0 }; - centerPanel.setLayout(centerPanelLayout); - - label = new JLabel("Please select a folder to index"); - label.setHorizontalAlignment(SwingConstants.CENTER); - Border labelBorder = new EmptyBorder(4, 4, 4, 4); - label.setBorder(labelBorder); - GridBagConstraints labelGBC = new GridBagConstraints(); - labelGBC.gridx = 0; - labelGBC.gridy = 0; - labelGBC.gridwidth = 2; - labelGBC.gridheight = 1; - - JLabel labelRoot = new JLabel("Root"); - labelRoot.setBorder(labelBorder); - GridBagConstraints labelRootGBC = new GridBagConstraints(); - labelRootGBC.gridx = 0; - labelRootGBC.gridy = 1; - labelRootGBC.anchor = GridBagConstraints.WEST; - - JLabel labelFiles = new JLabel("Files"); - labelFiles.setBorder(labelBorder); - GridBagConstraints labelFilesGBC = new 
GridBagConstraints(); - labelFilesGBC.gridx = 0; - labelFilesGBC.gridy = 2; - labelFilesGBC.anchor = GridBagConstraints.WEST; - - JLabel labelBytes = new JLabel("Bytes"); - labelBytes.setBorder(labelBorder); - labelBytes.setHorizontalTextPosition(JLabel.LEFT); - GridBagConstraints labelBytesGBC = new GridBagConstraints(); - labelBytesGBC.gridx = 0; - labelBytesGBC.gridy = 3; - labelBytesGBC.anchor = GridBagConstraints.WEST; - - textRoot = new JLabel("-"); - textRoot.setBorder(labelBorder); - GridBagConstraints textRootGBC = new GridBagConstraints(); - textRootGBC.gridx = 1; - textRootGBC.gridy = 1; - textRootGBC.anchor = GridBagConstraints.CENTER; - - textFiles = new JLabel("-"); - textFiles.setHorizontalAlignment(SwingConstants.RIGHT); - textFiles.setBorder(labelBorder); - GridBagConstraints textFilesGBC = new GridBagConstraints(); - textFilesGBC.gridx = 1; - textFilesGBC.gridy = 2; - textFilesGBC.anchor = GridBagConstraints.EAST; - - textBytes = new JLabel("-"); - textBytes.setHorizontalAlignment(SwingConstants.RIGHT); - textBytes.setBorder(labelBorder); - GridBagConstraints textBytesGBC = new GridBagConstraints(); - textBytesGBC.gridx = 1; - textBytesGBC.gridy = 3; - textBytesGBC.anchor = GridBagConstraints.EAST; - - centerPanel.add(label, labelGBC); - centerPanel.add(labelRoot, labelRootGBC); - centerPanel.add(labelFiles, labelFilesGBC); - centerPanel.add(labelBytes, labelBytesGBC); - centerPanel.add(textRoot, textRootGBC); - centerPanel.add(textFiles, textFilesGBC); - centerPanel.add(textBytes, textBytesGBC); - - } - - public void reportRoot(File file) { - textRoot.setText(file.getAbsolutePath()); - } - - public void reportProgress(IndexingProgressReporter.IndexingProgress progress) { - if (!progress.isDoneCrawling()) { - label.setText("Analysing files ..."); - textFiles.setText(progress.getFormattedCrawlingProgressFiles()); - textBytes.setText(progress.getFormattedCrawlingProgressBytes()); - } else if (!progress.isDoneLoading()) { - label.setText("Loading 
data ..."); - textFiles.setText(progress.getFormattedLoadingProgressFiles() // - + " / " + progress.getFormattedCrawlingProgressFiles()); - textBytes.setText(progress.getFormattedLoadingProgressBytes() // - + " / " + progress.getFormattedCrawlingProgressBytes()); - } else if (!progress.isDoneIndexing()) { - label.setText("Indexing ..."); - textFiles.setText(progress.getFormattedLoadingProgressFiles()); - textBytes.setText(progress.getFormattedLoadingProgressBytes()); - } else { - label.setText("Enter a search query"); - textFiles.setText(progress.getFormattedLoadingProgressFiles()); - textBytes.setText(progress.getFormattedLoadingProgressBytes()); - } - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/gui/MainMenuBar.java b/server/src/main/java/sk/linhard/exactly/gui/MainMenuBar.java deleted file mode 100644 index 5163f54..0000000 --- a/server/src/main/java/sk/linhard/exactly/gui/MainMenuBar.java +++ /dev/null @@ -1,29 +0,0 @@ -package sk.linhard.exactly.gui; - -import java.awt.event.ActionEvent; -import java.awt.event.ActionListener; - -import javax.swing.JMenu; -import javax.swing.JMenuBar; -import javax.swing.JMenuItem; - -public class MainMenuBar extends JMenuBar { - - public MainMenuBar(Core core) { - JMenu mnFile = new JMenu("File"); - add(mnFile); - - JMenuItem mnIndex = new JMenuItem("Index folder ..."); - mnFile.add(mnIndex); - - mnIndex.addActionListener(new ActionListener() { - - @Override - public void actionPerformed(ActionEvent e) { - core.onMenuActionIndexFolder(); - } - }); - - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/gui/MainWindow.java b/server/src/main/java/sk/linhard/exactly/gui/MainWindow.java deleted file mode 100644 index 10d5c66..0000000 --- a/server/src/main/java/sk/linhard/exactly/gui/MainWindow.java +++ /dev/null @@ -1,121 +0,0 @@ -package sk.linhard.exactly.gui; - -import java.awt.BorderLayout; -import java.awt.Dimension; -import java.awt.event.KeyEvent; -import java.awt.event.KeyListener; -import 
java.io.File; - -import javax.swing.JFrame; -import javax.swing.JLabel; -import javax.swing.JPanel; -import javax.swing.JTextField; -import javax.swing.SwingConstants; -import javax.swing.border.EmptyBorder; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import sk.linhard.exactly.impl.IndexingProgressReporter; - -public class MainWindow extends JFrame { - - private static final Logger log = LoggerFactory.getLogger(MainWindow.class); - private JTextField textField; - private EntryPanel entryPanel; - private ContentPanel contentPanel; - private JPanel searchPanel; - private Core core; - private String lastSearch; - - public MainWindow(Core core) { - this.core = core; - setTitle("Search"); - setSize(960, 480); - setLocationRelativeTo(null); - setDefaultCloseOperation(EXIT_ON_CLOSE); - - MainMenuBar mainMenu = new MainMenuBar(this.core); - setJMenuBar(mainMenu); - getContentPane().setLayout(new BorderLayout(0, 0)); - - searchPanel = new JPanel(); - getContentPane().add(searchPanel); - searchPanel.setLayout(new BorderLayout(0, 0)); - - JPanel panSearchBar = new JPanel(); - panSearchBar.setBorder(new EmptyBorder(2, 2, 2, 2)); - panSearchBar.setPreferredSize(new Dimension(0, 40)); - searchPanel.add(panSearchBar, BorderLayout.NORTH); - - JLabel lblSearchQuery = new JLabel("Search query:"); - lblSearchQuery.setVerticalAlignment(SwingConstants.CENTER); - lblSearchQuery.setBorder(new EmptyBorder(2, 2, 2, 2)); - lblSearchQuery.setSize(100, 50); - panSearchBar.add(lblSearchQuery); - - textField = new JTextField(); - textField.setEditable(false); - - textField.addKeyListener(new KeyListener() { - - @Override - public void keyTyped(KeyEvent e) { - - } - - @Override - public void keyReleased(KeyEvent e) { - String newSearch = textField.getText(); - if (lastSearch == null || !lastSearch.equals(newSearch)) { - if (newSearch != null && !newSearch.isEmpty()) { - lastSearch = newSearch; - log.debug("Query updated: {}", lastSearch); - 
entryPanel.setSearchResult(core.find(lastSearch)); - } - } - } - - @Override - public void keyPressed(KeyEvent e) { - - } - }); - - panSearchBar.add(textField); - textField.setColumns(10); - - entryPanel = new EntryPanel(core); - contentPanel = new ContentPanel(core); - - searchPanel.add(entryPanel, BorderLayout.CENTER); - - } - - public void exitHit() { - getContentPane().remove(contentPanel); - getContentPane().add(searchPanel); - JPanel p = (JPanel) getContentPane(); - p.updateUI(); - } - - public void reportRoot(File file) { - textField.setText(""); - textField.setEditable(false); - entryPanel.reportRoot(file); - } - - public void reportProgress(IndexingProgressReporter.IndexingProgress progress) { - textField.setEditable(progress.isDoneIndexing()); - entryPanel.reportProgress(progress); - } - - public void selectItem(SearchResultItem selectedItem) { - getContentPane().remove(searchPanel); - contentPanel.setContent(selectedItem); - getContentPane().add(contentPanel); - JPanel p = (JPanel) getContentPane(); - p.updateUI(); - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/gui/SearchResultItem.java b/server/src/main/java/sk/linhard/exactly/gui/SearchResultItem.java deleted file mode 100644 index 97b1a71..0000000 --- a/server/src/main/java/sk/linhard/exactly/gui/SearchResultItem.java +++ /dev/null @@ -1,51 +0,0 @@ -package sk.linhard.exactly.gui; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.lang3.StringUtils; - -import sk.linhard.exactly.Hit; -import sk.linhard.exactly.HitContext; -import sk.linhard.exactly.SearchResult; - -public class SearchResultItem { - - private SearchResult sr; - private Hit hit; - - public SearchResultItem(SearchResult sr, Hit hit) { - this.sr = sr; - this.hit = hit; - } - - public static List toItems(int maxLen, SearchResult sr) { - int listLen = Math.min(sr.size(), maxLen); - List r = new ArrayList<>(listLen); - for (int i = 0; i < listLen; i++) { - r.add(new SearchResultItem(sr, 
sr.hit(i))); - } - return r; - } - - public String file() { - return hit.document().id(); - } - - public HitContext charContext(int maxCtx) { - return hit.charContext(maxCtx, maxCtx); - } - - public HitContext lineContext(int maxLines) { - return hit.lineContext(maxLines, maxLines); - } - - public String matchLine(int maxCtx) { - HitContext ctx = hit.safeCharContext(maxCtx, maxCtx); - return StringUtils.leftPad(ctx.before(), maxCtx) + ctx.pattern() + StringUtils.rightPad(ctx.after(), maxCtx); - } - - public int patternLength() { - return sr.patternLength(); - } -} diff --git a/server/src/main/java/sk/linhard/exactly/impl/DefaultHit.java b/server/src/main/java/sk/linhard/exactly/impl/DefaultHit.java deleted file mode 100644 index d248356..0000000 --- a/server/src/main/java/sk/linhard/exactly/impl/DefaultHit.java +++ /dev/null @@ -1,53 +0,0 @@ -package sk.linhard.exactly.impl; - -import sk.linhard.exactly.Document; -import sk.linhard.exactly.Hit; -import sk.linhard.exactly.HitContext; -import sk.linhard.exactly.impl.DefaultSearch.DefaultSearchResult; - -class DefaultHit implements Hit { - - private final DefaultSearchResult searchResult; - private final int hitIdx; - - public DefaultHit(DefaultSearchResult defaultSearchResult, int hitIdx) { - searchResult = defaultSearchResult; - this.hitIdx = hitIdx; - } - - @Override - public String toString() { - return "[hitIdx=" + hitIdx + ", doc=" + document().id() + ", pos=" + position() + "]"; - } - - @Override - public int globalPosition() { - return searchResult.globalPosition(hitIdx); - } - - @Override - public int position() { - return searchResult.position(hitIdx); - } - - @Override - public Document document() { - return searchResult.document(hitIdx); - } - - @Override - public HitContext charContext(int charsBefore, int charsAfter) { - return searchResult.charContext(hitIdx, charsBefore, charsAfter); - } - - @Override - public HitContext safeCharContext(int charsBefore, int charsAfter) { - return 
searchResult.safeCharContext(hitIdx, charsBefore, charsAfter); - } - - @Override - public HitContext lineContext(int linesBefore, int linesAfter) { - return searchResult.lineContext(hitIdx, linesBefore, linesAfter); - } - -} \ No newline at end of file diff --git a/server/src/main/java/sk/linhard/exactly/impl/DefaultHitContext.java b/server/src/main/java/sk/linhard/exactly/impl/DefaultHitContext.java deleted file mode 100644 index 4cefdd5..0000000 --- a/server/src/main/java/sk/linhard/exactly/impl/DefaultHitContext.java +++ /dev/null @@ -1,46 +0,0 @@ -package sk.linhard.exactly.impl; - -import sk.linhard.exactly.HitContext; - -class DefaultHitContext implements HitContext { - - private DefaultSearch search; - private int ctxPosition; - private int beforeLength; - private int patternLength; - private int afterLength; - - DefaultHitContext(DefaultSearch search, int ctxPosition, int beforeLength, int patternLength, int afterLength) { - this.search = search; - this.ctxPosition = ctxPosition; - this.beforeLength = beforeLength; - this.patternLength = patternLength; - this.afterLength = afterLength; - } - - @Override - public byte[] before() { - return search.dataCopy(ctxPosition, beforeLength); - } - - @Override - public byte[] pattern() { - return search.dataCopy(ctxPosition + beforeLength, patternLength); - } - - @Override - public byte[] after() { - return search.dataCopy(ctxPosition + beforeLength + patternLength, afterLength); - } - - @Override - public int highlightStart() { - return beforeLength; - } - - @Override - public int highlightEnd() { - return beforeLength + patternLength; - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/impl/DefaultSearch.java b/server/src/main/java/sk/linhard/exactly/impl/DefaultSearch.java deleted file mode 100644 index 8cdfa9e..0000000 --- a/server/src/main/java/sk/linhard/exactly/impl/DefaultSearch.java +++ /dev/null @@ -1,794 +0,0 @@ -package sk.linhard.exactly.impl; - -import java.util.ArrayList; -import 
java.util.Arrays; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import java.util.Stack; -import java.util.function.Consumer; - -import org.apache.commons.lang.StringUtils; -import org.apache.commons.lang3.ArrayUtils; -import org.apache.commons.lang3.tuple.Pair; - -import sk.linhard.exactly.Document; -import sk.linhard.exactly.Hit; -import sk.linhard.exactly.HitContext; -import sk.linhard.exactly.Search; -import sk.linhard.exactly.SearchResult; - -/** - * Default search implementation that works with single document. - */ -public class DefaultSearch implements Search { - - protected final static int UNDEF = EnhancedSuffixArray.UNDEF; - protected final byte[] data; - protected final int[] SA; - private final int[] lcp; - private final int[] up; - private final int[] down; - private final int[] next; - protected final Interval rootInterval; - private final String documentId; - - public static DefaultSearch compute(Document document) { - EnhancedSuffixArray esa = new EnhancedSuffixArray(document.content()); - esa.computeLCP(); - esa.computeUpDown(); - esa.computeNext(); - return new DefaultSearch(document.id(), esa); - } - - protected DefaultSearch(String documentId, EnhancedSuffixArray esa) { - this.data = esa.data; - this.SA = esa.SA; - this.lcp = esa.lcp; - this.up = esa.up; - this.down = esa.down; - this.next = esa.next; - this.rootInterval = new Interval(0, 0, SA.length - 1); - this.documentId = documentId; - } - - @Override - public int documentCount() { - return 1; - } - - @Override - public Document document(int documentIndex) { - if (documentIndex < 0) { - throw new ArrayIndexOutOfBoundsException("Negative index"); - } - if (documentIndex > 0) { - throw new ArrayIndexOutOfBoundsException( - "This is a single document search. 
Index " + documentIndex + " is not valid"); - } - return new Document() { - - @Override - public String id() { - return documentId; - } - - @Override - public byte[] content() { - return data; - } - - @Override - public int index() { - return documentIndex; - } - - }; - } - - private void acceptInterval(Interval parent, int childStart, int childEnd, Consumer func) { - childEnd = childEnd == UNDEF ? parent.end : childEnd; - if (childStart + 1 < childEnd) { - func.accept(interval(childStart, childEnd)); - } else if (childStart != childEnd) { - func.accept(new Interval(parent.len, childStart, childEnd)); - } - } - - private Interval createInterval(Interval parent, int childStart, int childEnd) { - childEnd = childEnd == UNDEF ? parent.end : childEnd; - if (childStart + 1 < childEnd) { - return interval(childStart, childEnd); - } else if (childStart != childEnd) { - return new Interval(parent.len, childStart, childEnd); - } else { - return null; - } - } - - private String substr(int i) { - return substr(i, data.length); - } - - private String substr(int start, int end) { - if (start < 0) { - start = 0; - } - if (end > data.length) { - end = data.length; - } - byte[] buf = new byte[end - start]; - System.arraycopy(data, start, buf, 0, buf.length); - return new String(buf); - } - - String print() { - StringBuffer s = new StringBuffer(); - s.append(StringUtils.leftPad("i", 6)); - s.append(StringUtils.leftPad("SA[i]", 6)); - s.append(StringUtils.leftPad("lcp[i]", 7)); - s.append(StringUtils.leftPad("up[i]", 6)); - s.append(StringUtils.leftPad("down[i]", 8)); - s.append(StringUtils.leftPad("next[i]", 8)); - s.append(" suffix[SA[i]]\n"); - - int n = SA.length; - - for (int i = 0; i < n; i++) { - s.append(StringUtils.leftPad(Integer.toString(i), 6)); - s.append(StringUtils.leftPad(Integer.toString(SA[i]), 6)); - s.append(StringUtils.leftPad(Integer.toString(lcp[i]), 7)); - s.append(StringUtils.leftPad(Integer.toString(up[i]), 6)); - 
s.append(StringUtils.leftPad(Integer.toString(down[i]), 8)); - s.append(StringUtils.leftPad(Integer.toString(next[i]), 8)); - s.append(" "); - s.append(substr(SA[i])); - s.append("\n"); - } - - return s.toString(); - } - - private int firstLIndex(Interval parent) { - if (parent == rootInterval) { - return 0; - } else { - int cup = up[parent.end]; - if (cup < parent.end && parent.start < cup) { - return cup; - } else { - return down[parent.start]; - } - } - } - - protected Byte edgeChar(Interval parent, Interval child) { - int pos = SA[child.start] + parent.len; - return pos >= data.length ? null : data[pos]; - } - - private Interval getInterval(Interval parent, Byte c) { - for (Interval child : children(parent)) { - if (c.equals(edgeChar(parent, child))) { - return child; - } - } - return null; - } - - protected Iterable children(Interval parent) { - return new Iterable() { - - @Override - public Iterator iterator() { - return new IntervalIterator(parent); - } - }; - } - - protected void forEachChild(Interval parent, Consumer func) { - int i = parent.start; - int nexti = firstLIndex(parent); - if (nexti == i) { - nexti = next[i]; - } - acceptInterval(parent, i, nexti, func); - while (nexti != UNDEF) { - i = nexti; - nexti = next[i]; - acceptInterval(parent, i, nexti, func); - } - } - - protected boolean match(byte[] pattern, int dataOff, int patternOff, int len) { - for (int i = 0; i < len; i++) { - int pIdx = patternOff + i; - int dIdx = dataOff + i; - if (pIdx >= pattern.length || dIdx >= data.length || pattern[pIdx] != data[dIdx]) { - return false; - } - } - return true; - } - - @Override - public SearchResult find(byte[] pattern) { - if (pattern == null || pattern.length == 0) { - throw new RuntimeException("You must specify a non-empty pattern"); - } - int c = 0; - boolean queryFound = true; - Interval intv = getInterval(rootInterval, pattern[c]); - int intvLen = 0; - while (intv != null && c < pattern.length && queryFound) { - intvLen = intv.end - intv.start; 
- if (intvLen > 1) { - int min = Math.min(intv.len, pattern.length); - queryFound = match(pattern, SA[intv.start] + c, c, min - c); - c = min; - if (c < pattern.length) { - intv = getInterval(intv, pattern[c]); - } - } else { - queryFound = match(pattern, SA[intv.start] + c, c, pattern.length - c); - break; - } - } - if (intv != null && queryFound) { - return createSearchResult(intv.start, intvLen, pattern.length); - } else { - return new EmptyResultImpl(pattern); - } - } - - protected SearchResult createSearchResult(int saIntervalStart, int saIntervalLength, int patternLength) { - return new DefaultSearchResult(saIntervalStart, saIntervalLength, patternLength); - } - - private Interval interval(int i, int j) { - int cup = up[j]; - if (cup < j && i < cup) { - return new Interval(lcp[cup], i, j); - } else { - return new Interval(lcp[down[i]], i, j); - } - } - - public int getLCPOverflow1() { - int cnt = 0; - for (int i = 0; i < lcp.length; i++) { - if (lcp[i] > 254) { - cnt++; - } - } - return cnt; - } - - public int getLCPOverflow2() { - int max = 256 * 256 - 2; - int cnt = 0; - for (int i = 0; i < lcp.length; i++) { - if (lcp[i] > max) { - cnt++; - } - } - return cnt; - } - - byte[] dataCopy(int globalPosition, int length) { - byte[] buf = new byte[length]; - System.arraycopy(data, globalPosition, buf, 0, buf.length); - return buf; - } - - /** - * This is basically "shortest non-substring" problem - */ - byte[] findSeparator() { - Stack> intervalStack = new Stack<>(); - boolean[] occurenceBuf = new boolean[256]; - intervalStack.push(Pair.of(0, rootInterval)); - - while (!intervalStack.isEmpty()) { - Pair t = intervalStack.pop(); - Integer sepLen = t.getLeft(); - Interval interval = t.getRight(); - Byte nonExistentChar = findNonExistentChar(interval, sepLen, occurenceBuf); - if (nonExistentChar != null) { - return buildSeparator(interval.start, sepLen, nonExistentChar); - } else { - forEachChild(interval, child -> { - intervalStack.push(Pair.of(sepLen + 1, child)); 
- }); - } - } - - throw new IllegalStateException("Separator must be found"); - } - - private byte[] buildSeparator(int saIdx, int sepLen, byte tail) { - byte[] separator = new byte[sepLen + 1]; - System.arraycopy(data, SA[saIdx], separator, 0, sepLen); - separator[sepLen] = tail; - return separator; - } - - private Byte findNonExistentChar(Interval parent, int sepLen, boolean[] occurence) { - Arrays.fill(occurence, false); - forEachChild(parent, child -> { - Byte edgeStart = SA[child.start] + sepLen >= data.length ? null : data[SA[child.start] + sepLen]; - if (edgeStart != null) { - int occurenceIdx = edgeStart; - occurenceIdx -= Byte.MIN_VALUE; - occurence[occurenceIdx] = true; - } - }); - for (int i = 0; i < occurence.length; i++) { - if (!occurence[i]) { - return (byte) (i + Byte.MIN_VALUE); - } - } - return null; - } - - String printSA(int start, int end) { - StringBuilder s = new StringBuilder(); - s.append(StringUtils.leftPad("i", 12)); - s.append(StringUtils.leftPad("SA[i]", 12)); - s.append(StringUtils.leftPad("lcp[i]", 12)); - s.append(StringUtils.leftPad("up[i]", 12)); - s.append(StringUtils.leftPad("down[i]", 12)); - s.append(StringUtils.leftPad("next[i]", 12)); - s.append(" suffix start"); - s.append("\n"); - for (int i = start; i < end; i++) { - s.append(StringUtils.leftPad(Integer.toString(i), 12)); - s.append(StringUtils.leftPad(Integer.toString(SA[i]), 12)); - s.append(StringUtils.leftPad(Integer.toString(lcp[i]), 12)); - s.append(StringUtils.leftPad(Integer.toString(up[i]), 12)); - s.append(StringUtils.leftPad(Integer.toString(down[i]), 12)); - s.append(StringUtils.leftPad(Integer.toString(next[i]), 12)); - s.append(" "); - s.append(printSuffix(SA[i], 10)); - s.append("\n"); - } - return s.toString(); - } - - String printSuffix(int pos, int len) { - byte[] cp = Arrays.copyOfRange(data, pos, pos + len); - Byte[] cpo = ArrayUtils.toObject(cp); - return Arrays.asList(cpo).toString(); - } - - String printArray(int[] a, int start, int end) { - 
StringBuilder s = new StringBuilder(); - s.append(StringUtils.leftPad("i", 12)); - s.append(StringUtils.leftPad("a[i]", 12)); - s.append("\n"); - for (int i = 0; i < end - start; i++) { - s.append(StringUtils.leftPad(Integer.toString(i), 12)); - s.append(StringUtils.leftPad(Integer.toString(a[i]), 12)); - s.append("\n"); - } - return s.toString(); - } - - String printArray(byte[] a, int start, int end) { - StringBuilder s = new StringBuilder(); - s.append(StringUtils.leftPad("i", 12)); - s.append(StringUtils.leftPad("a[i]", 12)); - s.append("\n"); - for (int i = 0; i < end - start; i++) { - s.append(StringUtils.leftPad(Integer.toString(i), 12)); - s.append(StringUtils.leftPad(Integer.toString(a[i]), 12)); - s.append("\n"); - } - return s.toString(); - } - - int isNewLine(int i) { - return isNewLine(i, this.data); - } - - /** - * recognizes following newline sequences: [13], [10], [13, 10] - * - * @return 0 if newline not present, 1 newline of length 1 present, 2 - * newline of length 2 present at given index - */ - static int isNewLine(int i, byte[] data) { - if (i >= 0 && i < data.length) { - byte c0 = data[i]; - if (c0 == 13) { - return i == data.length - 1 || data[i + 1] != 10 ? 1 : 2; - } else if (c0 == 10) { - return i == 0 || data[i - 1] != 13 ? 
1 : 0; - } else { - return 0; - } - } else { - return 0; - } - } - - String print(int saIntervalStart, int saIntervalLength, int patternLength, int context) { - StringBuffer s = new StringBuffer(); - s.append(StringUtils.leftPad("i", 6)); - s.append(StringUtils.leftPad("pos", 6)); - s.append(StringUtils.leftPad("doc", 6)); - s.append(" match\n"); - - for (int i = 0; i < saIntervalLength; i++) { - s.append(StringUtils.leftPad(Integer.toString(i), 6)); - int posi = SA[saIntervalStart + i]; - s.append(StringUtils.leftPad(Integer.toString(posi), 6)); - s.append(StringUtils.leftPad(document(i).id(), 6)); - s.append(" "); - s.append(substr(posi - context, posi)); - s.append(">"); - s.append(substr(posi, posi + patternLength)); - s.append("<"); - s.append(substr(posi + patternLength, posi + patternLength + context)); - s.append("\n"); - } - - return s.toString(); - } - - int dataLength() { - return data.length; - } - - protected static class Interval { - public int len; - public int start; - public int end; - - public Interval(int len, int start, int end) { - this.len = len; - this.start = start; - this.end = end; - } - - @Override - public String toString() { - return Integer.toString(len) + "-[" + start + ", " + end + "]"; - } - - @Override - public int hashCode() { - final int prime = 31; - int result = 1; - result = prime * result + end; - result = prime * result + len; - result = prime * result + start; - return result; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - Interval other = (Interval) obj; - if (end != other.end) - return false; - if (len != other.len) - return false; - if (start != other.start) - return false; - return true; - } - - } - - private class IntervalIterator implements Iterator { - private Interval parent; - private int start; - private int end; - private Interval nextInterval; - - public IntervalIterator(Interval 
parent) { - this.parent = parent; - this.start = parent.start; - this.end = firstLIndex(parent); - if (end == start) { - end = next[start]; - } - this.nextInterval = createInterval(parent, start, end); - } - - @Override - public boolean hasNext() { - return nextInterval != null; - } - - @Override - public Interval next() { - Interval r = nextInterval; - if (end != UNDEF) { - start = end; - end = next[start]; - this.nextInterval = createInterval(parent, start, end); - } else { - this.nextInterval = null; - } - return r; - } - - } - - class EmptyResultImpl implements SearchResult { - private byte[] pattern; - - public EmptyResultImpl(byte[] pattern) { - this.pattern = pattern; - } - - @Override - public boolean isEmpty() { - return true; - } - - @Override - public int size() { - return 0; - } - - @Override - public DefaultHit hit(int i) { - throw new IndexOutOfBoundsException("Result is empty"); - } - - @Override - public int patternLength() { - return pattern.length; - } - - @Override - public byte[] pattern() { - return pattern; - } - - @Override - public boolean hasGlobalPosition(int position) { - return false; - } - - @Override - public DefaultHit hitWithGlobalPosition(int position) { - return null; - } - - @Override - public boolean hasPosition(int document, int position) { - return false; - } - - @Override - public DefaultHit hitWithPosition(int document, int position) { - return null; - } - - @Override - public List> hits() { - return Collections.emptyList(); - } - - @Override - public Iterator> iterator() { - return Collections.emptyIterator(); - } - - @Override - public Iterable> skipIterator(int offset) { - return new Iterable>() { - - @Override - public Iterator> iterator() { - return Collections.emptyIterator(); - } - }; - } - } - - class DefaultSearchResult implements SearchResult { - - private final int saIntervalStart; - private final int saIntervalLength; - private final int patternLength; - - public DefaultSearchResult(int saIntervalStart, int 
saIntervalLength, int patternLength) { - this.saIntervalStart = saIntervalStart; - this.saIntervalLength = saIntervalLength; - this.patternLength = patternLength; - } - - @Override - public boolean isEmpty() { - return false; - } - - @Override - public int size() { - return saIntervalLength; - } - - protected int globalPosition(int hitIdx) { - if (hitIdx < 0 || hitIdx >= saIntervalLength) { - throw new IndexOutOfBoundsException( - "The hit index " + hitIdx + " exceeds the search result size " + saIntervalLength); - } - return SA[saIntervalStart + hitIdx]; - } - - protected int position(int hitIdx) { - return globalPosition(hitIdx); - } - - protected Document document(int hitIdx) { - return DefaultSearch.this.document(documentIndex(hitIdx)); - } - - protected int documentIndex(int hitIdx) { - return 0; - } - - @Override - public boolean hasGlobalPosition(int position) { - return hitWithGlobalPosition(position) != null; - } - - @Override - public DefaultHit hitWithGlobalPosition(int position) { - int saIntervalEnd = saIntervalStart + saIntervalLength; - for (int i = saIntervalStart; i < saIntervalEnd; i++) { - if (SA[i] == position) { - return new DefaultHit(this, i - saIntervalStart); - } - } - return null; - } - - @Override - public boolean hasPosition(int document, int position) { - return hitWithPosition(document, position) != null; - } - - @Override - public DefaultHit hitWithPosition(int document, int position) { - return hitWithGlobalPosition(position); - } - - @Override - public int patternLength() { - return patternLength; - } - - @Override - public byte[] pattern() { - return dataCopy(globalPosition(0), patternLength); - } - - protected int checkBefore(int pos, int maxSize) { - return Math.max(pos - maxSize, 0); - } - - protected int checkAfter(int pos, int maxSize) { - return Math.min(pos + maxSize, dataLength()); - } - - protected int linesBeforeStart(int i, int maxLines) { - int j = globalPosition(i); - int newLine = 0; - int lineCount = 0; - while (j >= 
0 && lineCount <= maxLines) { - newLine = isNewLine(j); - if (newLine > 0) { - lineCount++; - } - j--; - } - return j + 1 + newLine; - } - - protected int linesAfterStart(int i, int maxLines) { - int j = globalPosition(i) + patternLength; - int lineCount = 0; - int dataLength = dataLength(); - while (j < dataLength && lineCount <= maxLines) { - if (isNewLine(j) > 0) { - lineCount++; - } - j++; - } - return j == dataLength ? j : j - 1; - } - - HitContext charContext(int idx, int charsBefore, int charsAfter) { - if (charsBefore < 0 || charsAfter < 0) { - throw new IllegalArgumentException("Negative context length"); - } - int pos = globalPosition(idx); - int beforeStart = checkBefore(pos, charsBefore); - int afterEnd = checkAfter(pos + patternLength, charsAfter); - return new DefaultHitContext(DefaultSearch.this, beforeStart, pos - beforeStart, patternLength, - afterEnd - pos - patternLength); - } - - HitContext safeCharContext(int idx, int charsBefore, int charsAfter) { - int pos = globalPosition(idx); - int beforeStart = checkBefore(pos, charsBefore); - int afterEnd = checkAfter(pos, charsAfter); - return new SafeHitContext(DefaultSearch.this, beforeStart, pos - beforeStart, patternLength, - afterEnd - pos - patternLength); - } - - HitContext lineContext(int idx, int linesBefore, int linesAfter) { - if (linesBefore < 0 || linesAfter < 0) { - throw new IllegalArgumentException("Negative context length"); - } - int patternStart = globalPosition(idx); - int beforeStart = linesBeforeStart(idx, linesBefore); - int afterEnd = linesAfterStart(idx, linesAfter); - return new DefaultHitContext(DefaultSearch.this, beforeStart, patternStart - beforeStart, patternLength, - afterEnd - patternStart - patternLength); - } - - @Override - public DefaultHit hit(int idx) { - return new DefaultHit(this, idx); - } - - @Override - public List> hits() { - int n = size(); - List> r = new ArrayList<>(n); - for (int i = 0; i < n; i++) { - r.add(new DefaultHit(this, i)); - } - return r; - } - 
- @Override - public Iterator> iterator() { - return new HitIterator(0); - } - - @Override - public Iterable> skipIterator(int offset) { - return new Iterable>() { - - @Override - public Iterator> iterator() { - return new HitIterator(offset); - } - }; - } - - class HitIterator implements Iterator> { - - private int idx; - - public HitIterator(int offset) { - this.idx = offset - 1; - } - - @Override - public boolean hasNext() { - return idx + 1 < DefaultSearchResult.this.size(); - } - - @Override - public Hit next() { - return hit(++idx); - } - - } - - } -} diff --git a/server/src/main/java/sk/linhard/exactly/impl/EnhancedSuffixArray.java b/server/src/main/java/sk/linhard/exactly/impl/EnhancedSuffixArray.java deleted file mode 100644 index 105d3f8..0000000 --- a/server/src/main/java/sk/linhard/exactly/impl/EnhancedSuffixArray.java +++ /dev/null @@ -1,140 +0,0 @@ -package sk.linhard.exactly.impl; - -import static sk.linhard.exactly.impl.sais.suffixsort; - -import java.util.Arrays; -import java.util.Stack; - -/** - * Creates an extended suffix array based on algorithms described in "Replacing - * suffix trees with enhanced suffix arrays" by Abouelhoda, Kurtz Ohlebusch - * - * The suffix array is constructed using SA-IS algorithm implementation by Yuta - * Mori. 
- * - */ -class EnhancedSuffixArray { - - protected final static int UNDEF = -1; - - public byte[] data; - public int[] SA; - public int[] lcp; - public int[] rank; // byproduct of lcp computation - public int[] up; - public int[] down; - public int[] next; - - EnhancedSuffixArray(byte[] data) { - this.data = data; - this.SA = new int[data.length + 1]; - suffixsort(data, this.SA, data.length); - } - - void computeLCP() { - computeLCP(false); - } - - void computeLCP(boolean keepRank) { - int start = 0; - int length = data.length; - this.rank = new int[length]; - for (int i = 0; i < length; i++) - rank[SA[i]] = i; - int h = 0; - this.lcp = new int[length + 1]; - for (int i = 0; i < length; i++) { - int k = rank[i]; - if (k == 0) { - lcp[k] = -1; - } else { - final int j = SA[k - 1]; - while (i + h < length && j + h < length && data[start + i + h] == data[start + j + h]) { - h++; - } - lcp[k] = h; - } - if (h > 0) - h--; - } - lcp[0] = 0; - lcp[length] = 0; - if (!keepRank) { - rank = null; - } - } - - void computeUpDown() { - up = new int[lcp.length]; - down = new int[lcp.length]; - Arrays.fill(up, UNDEF); - Arrays.fill(down, UNDEF); - - int lastIndex = UNDEF; - Stack stack = new Stack<>(); - stack.push(0); - for (int i = 1; i < lcp.length; i++) { - while (lcp[i] < lcp[stack.peek()]) { - lastIndex = stack.pop(); - if (lcp[i] <= lcp[stack.peek()] && lcp[stack.peek()] != lcp[lastIndex]) { - down[stack.peek()] = lastIndex; - } - } - if (lastIndex != UNDEF) { - up[i] = lastIndex; - lastIndex = UNDEF; - } - stack.push(i); - } - } - - void computeNext() { - next = new int[lcp.length]; - Arrays.fill(next, UNDEF); - - Stack stack = new Stack<>(); - stack.push(0); - for (int i = 1; i < lcp.length; i++) { - while (lcp[i] < lcp[stack.peek()]) { - stack.pop(); - } - if (lcp[i] == lcp[stack.peek()]) { - int lastIndex = stack.pop(); - next[lastIndex] = i; - } - stack.push(i); - } - } - - void introduceSeparators(int[] offsets, byte[] separator) { - int separatorExtraSpace = 
(offsets.length - 1) * separator.length; - byte[] newData = new byte[data.length + separatorExtraSpace]; - int lastIdx = offsets.length - 1; - for (int i = 0; i < lastIdx; i++) { - int oldOffset = offsets[i]; - separatorExtraSpace = i * separator.length; - moveSegment(oldOffset, offsets[i + 1], separatorExtraSpace, newData); - offsets[i] = oldOffset + separatorExtraSpace; - } - int oldOffset = offsets[lastIdx]; - separatorExtraSpace = lastIdx * separator.length; - moveSegment(oldOffset, data.length, separatorExtraSpace, newData); - offsets[lastIdx] = oldOffset + separatorExtraSpace; - - for (int i = 0; i < separator.length; i++) { - byte sepChar = separator[i]; - for (int j = 1; j < offsets.length; j++) { - newData[offsets[j] - separator.length + i] = sepChar; - } - } - - data = newData; - } - - void moveSegment(int start, int end, int separatorExtraSpace, byte[] newData) { - System.arraycopy(data, start, newData, start + separatorExtraSpace, end - start); - for (int j = start; j < end; j++) { - SA[rank[j]] += separatorExtraSpace; - } - } -} diff --git a/server/src/main/java/sk/linhard/exactly/impl/FileLoader.java b/server/src/main/java/sk/linhard/exactly/impl/FileLoader.java deleted file mode 100644 index 752296b..0000000 --- a/server/src/main/java/sk/linhard/exactly/impl/FileLoader.java +++ /dev/null @@ -1,136 +0,0 @@ -package sk.linhard.exactly.impl; - -import static java.util.stream.Collectors.toList; - -import java.io.File; -import java.io.FileFilter; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -import org.apache.commons.io.FileUtils; - -import sk.linhard.exactly.Search; -import sk.linhard.exactly.StringSearchBuilder; - -public class FileLoader { - - private List roots; - private List items; - private FileFilter fileFilter; - private StringSearchBuilder searchBuilder; - private IndexingProgressReporter reporter; - - public FileLoader(List roots, IndexingProgressReporter reporter) { - this.roots = roots; - 
this.fileFilter = f -> true; - this.reporter = reporter; - } - - public List getRoots() { - return roots; - } - - private List crawlInternal() { - if (roots == null || roots.isEmpty()) { - return Collections.emptyList(); - } - List collector = new ArrayList<>(); - for (File file : roots) { - if (file.isDirectory()) { - collector.addAll(crawlDirectory(file)); - } else { - collector.add(itemFor(file)); - } - } - return collector; - } - - private Item itemFor(File file) { - Item item = new Item(file, FileUtils.sizeOf(file)); - reporter.discovered(file, item.size); - return item; - } - - private List crawlDirectory(File dir) { - if (dir == null) { - return Collections.emptyList(); - } - List collector = new ArrayList<>(); - for (File file : dir.listFiles()) { - if (file.isDirectory()) { - collector.addAll(crawlDirectory(file)); - } else if (fileFilter.accept(file)) { - collector.add(itemFor(file)); - } - } - return collector; - } - - public void crawl() { - items = crawlInternal(); - reporter.doneCrawling(); - } - - public int getFileCount() { - return items.size(); - } - - public int getTotalSize() { - return searchBuilder.totalLength(); - } - - public void load() { - if (items == null || !reporter.isDoneCrawling()) { - throw new IllegalStateException("Must crawl the filesystem first"); - } - reporter.startLoading(); - searchBuilder = new StringSearchBuilder(); - for (int i = 0; i < items.size(); i++) { - Item r = items.get(i); - searchBuilder.add(r.file.getAbsolutePath(), r.file, (int) r.size); - reporter.added(r.file, r.size); - } - reporter.doneLoading(); - } - - public Search index() { - if (searchBuilder == null || !reporter.isDoneLoading()) { - throw new IllegalStateException("Must load the data first"); - } - - reporter.startIndexing(); - Search search = searchBuilder.build(); - reporter.doneIndexing(); - - return search; - } - - public Search indexBinary() { - if (searchBuilder == null || !reporter.isDoneLoading()) { - throw new IllegalStateException("Must 
load the data first"); - } - - reporter.startIndexing(); - Search search = searchBuilder.buildBinary(); - reporter.doneIndexing(); - - return search; - } - - public List fileList() { - return items.stream().map(i -> i.file).collect(toList()); - } - - private static class Item { - public File file; - public long size; - - public Item(File file, long fileSize) { - this.file = file; - this.size = fileSize; - } - - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/impl/HitLineContext.java b/server/src/main/java/sk/linhard/exactly/impl/HitLineContext.java deleted file mode 100644 index b9e0e30..0000000 --- a/server/src/main/java/sk/linhard/exactly/impl/HitLineContext.java +++ /dev/null @@ -1,9 +0,0 @@ -package sk.linhard.exactly.impl; - -class HitLineContext { - - public byte[] linesBefore; - public byte[] linesAfter; - public int highlightStart; - public int highlightEnd; -} diff --git a/server/src/main/java/sk/linhard/exactly/impl/IndexingProgressReporter.java b/server/src/main/java/sk/linhard/exactly/impl/IndexingProgressReporter.java deleted file mode 100644 index 6f213ae..0000000 --- a/server/src/main/java/sk/linhard/exactly/impl/IndexingProgressReporter.java +++ /dev/null @@ -1,152 +0,0 @@ -package sk.linhard.exactly.impl; - -import java.io.File; -import java.text.DecimalFormat; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class IndexingProgressReporter { - private static final Logger log = LoggerFactory.getLogger(FileLoader.class); - private static final DecimalFormat NFMT = new DecimalFormat("#,###,###"); - - private volatile boolean doneCrawling = false; - private volatile boolean doneLoading = false; - private volatile boolean doneIndexing = false; - private volatile long crawlingProgressBytes = 0l; - private volatile int crawlingProgressFiles = 0; - private volatile int loadingProgressBytes = 0; - private volatile int loadingProgressFiles = 0; - - public void discovered(File file, long size) { - log.debug("Found {} size 
{}", file.getAbsolutePath(), NFMT.format(size)); - crawlingProgressBytes += size; - } - - public void added(File file, long size) { - log.debug("Added {} size {}", file.getAbsolutePath(), NFMT.format(size)); - loadingProgressBytes += size; - loadingProgressFiles++; - } - - public void doneCrawling() { - if (crawlingProgressBytes > Integer.MAX_VALUE) { - throw new RuntimeException("Total data size cannot exceed 2 GB"); - } - log.debug("Found {} bytes in {} files", NFMT.format(crawlingProgressBytes), NFMT.format(crawlingProgressFiles)); - doneCrawling = true; - } - - public void doneLoading() { - log.debug("Loaded {} bytes from {} files", NFMT.format(loadingProgressBytes), - NFMT.format(loadingProgressFiles)); - doneLoading = true; - } - - public void doneIndexing() { - log.debug("Done indexing. Created suffix array"); - doneIndexing = true; - } - - public boolean isDoneCrawling() { - return doneCrawling; - } - - public boolean isDoneLoading() { - return doneLoading; - } - - public boolean isDoneIndexing() { - return doneIndexing; - } - - public void startLoading() { - log.debug("Loading {} bytes", NFMT.format(this.crawlingProgressBytes)); - } - - public void startIndexing() { - log.debug("Indexing {} bytes", NFMT.format(loadingProgressBytes)); - } - - public IndexingProgress getProgress() { - return new IndexingProgressReporter.IndexingProgress(// - doneCrawling, // - doneLoading, // - doneIndexing, // - crawlingProgressBytes, // - crawlingProgressFiles, // - loadingProgressBytes, // - loadingProgressFiles); - } - - public static class IndexingProgress { - private static final DecimalFormat NFMT = new DecimalFormat("#,###,###"); - - private final boolean doneCrawling; - private final boolean doneLoading; - private final boolean doneIndexing; - private final long crawlingProgressBytes; - private final int crawlingProgressFiles; - private final int loadingProgressBytes; - private final int loadingProgressFiles; - - public IndexingProgress(boolean doneCrawling, boolean 
doneLoading, boolean doneIndexing, - long crawlingProgressBytes, int crawlingProgressFiles, int loadingProgressBytes, - int loadingProgressFiles) { - super(); - this.doneCrawling = doneCrawling; - this.doneLoading = doneLoading; - this.doneIndexing = doneIndexing; - this.crawlingProgressBytes = crawlingProgressBytes; - this.crawlingProgressFiles = crawlingProgressFiles; - this.loadingProgressBytes = loadingProgressBytes; - this.loadingProgressFiles = loadingProgressFiles; - } - - public boolean isDoneCrawling() { - return doneCrawling; - } - - public boolean isDoneLoading() { - return doneLoading; - } - - public boolean isDoneIndexing() { - return doneIndexing; - } - - public long getCrawlingProgressBytes() { - return crawlingProgressBytes; - } - - public int getCrawlingProgressFiles() { - return crawlingProgressFiles; - } - - public int getLoadingProgressBytes() { - return loadingProgressBytes; - } - - public int getLoadingProgressFiles() { - return loadingProgressFiles; - } - - public String getFormattedCrawlingProgressBytes() { - return NFMT.format(crawlingProgressBytes); - } - - public String getFormattedCrawlingProgressFiles() { - return NFMT.format(crawlingProgressFiles); - } - - public String getFormattedLoadingProgressBytes() { - return NFMT.format(loadingProgressBytes); - } - - public String getFormattedLoadingProgressFiles() { - return NFMT.format(loadingProgressFiles); - } - - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/impl/MultiDocumentSearch.java b/server/src/main/java/sk/linhard/exactly/impl/MultiDocumentSearch.java deleted file mode 100644 index 7866ccd..0000000 --- a/server/src/main/java/sk/linhard/exactly/impl/MultiDocumentSearch.java +++ /dev/null @@ -1,262 +0,0 @@ -package sk.linhard.exactly.impl; - -import java.util.Arrays; - -import sk.linhard.exactly.Document; -import sk.linhard.exactly.SearchResult; - -/** - * Search in data built from multiple documents. 
- * - * Optimisation proposals for separator: - it can be at most 5 bytes (it's not - * possible that the 2GB of data contains all 4byte values) - can be stored in - * one long - can be compared as long with bit operations - */ -public class MultiDocumentSearch extends DefaultSearch { - - private final int[] offsets; - private final String[] ids; - private final byte[] separator; - private final int newLineInSeparator; - - public static MultiDocumentSearch compute(byte[] data, int[] offsets, String[] ids) { - EnhancedSuffixArray esa = new EnhancedSuffixArray(data); - esa.computeLCP(true); - esa.computeUpDown(); - esa.computeNext(); - byte[] separator = new DefaultSearch(null, esa).findSeparator(); - esa.introduceSeparators(offsets, separator); - esa = new EnhancedSuffixArray(esa.data); - esa.computeLCP(); - esa.computeUpDown(); - esa.computeNext(); - return new MultiDocumentSearch(esa, offsets, ids, separator); - } - - public static MultiDocumentSearch compute(byte[] data, int[] offsets, String[] ids, byte[] separator) { - EnhancedSuffixArray esa = new EnhancedSuffixArray(data); - esa.computeLCP(true); - esa.computeUpDown(); - esa.computeNext(); - SearchResult sr = new DefaultSearch(null, esa).find(separator); - if (!sr.isEmpty()) { - throw new IllegalArgumentException("Given separator found at position " + sr.hit(0).globalPosition()); - } - esa.introduceSeparators(offsets, separator); - esa = new EnhancedSuffixArray(esa.data); - esa.computeLCP(); - esa.computeUpDown(); - esa.computeNext(); - return new MultiDocumentSearch(esa, offsets, ids, separator); - } - - public MultiDocumentSearch(EnhancedSuffixArray esa, int[] offsets, String[] ids, byte[] separator) { - super(null, esa); - this.offsets = offsets; - this.ids = ids; - this.separator = separator; - this.newLineInSeparator = newLineInSeparator(separator); - } - - private int newLineInSeparator(byte[] separator) { - for (int i = 0; i < separator.length; i++) { - if (isNewLine(i, separator) > 0) { - return i; - 
} - } - return -1; - } - - @Override - public int documentCount() { - return offsets.length; - } - - @Override - public Document document(int documentIdx) { - if (documentIdx < 0) { - throw new ArrayIndexOutOfBoundsException("Negative index"); - } - if (documentIdx >= offsets.length) { - throw new ArrayIndexOutOfBoundsException( - "This search contains " + offsets.length + " documents. Index " + documentIdx + " is not valid"); - } - - return new Document() { - - @Override - public int index() { - return documentIdx; - } - - @Override - public String id() { - return ids[documentIdx]; - } - - @Override - public byte[] content() { - return documentContent(documentIdx); - } - }; - } - - private byte[] documentContent(int documentIdx) { - if (documentIdx < 0 || documentIdx >= offsets.length) { - throw new ArrayIndexOutOfBoundsException(); - } - int start = offsets[documentIdx]; - int end = documentIdx == offsets.length - 1 ? data.length : offsets[documentIdx + 1] - separator.length; - return dataCopy(start, end - start); - } - - @Override - protected Byte edgeChar(Interval parent, Interval child) { - int pos = SA[child.start] + parent.len; - return pos < data.length && !separatorAt(pos) ? 
data[pos] : null; - } - - @Override - protected boolean match(byte[] pattern, int dataOff, int patternOff, int len) { - for (int i = 0; i < len; i++) { - int pIdx = patternOff + i; - int dIdx = dataOff + i; - if (pIdx >= pattern.length || dIdx >= data.length || pattern[pIdx] != data[dIdx] || separatorAt(dIdx)) { - return false; - } - } - return true; - } - - private boolean separatorAt(int pos) { - if (pos + separator.length <= data.length && pos >= 0) { - for (int i = 0; i < separator.length; i++) { - if (separator[i] != data[pos + i]) { - return false; - } - } - return true; - } else { - return false; - } - } - - @Override - protected SearchResult createSearchResult(int saIntervalStart, int saIntervalLength, int patternLength) { - return new MultiDocResultImpl(saIntervalStart, saIntervalLength, patternLength); - } - - class MultiDocResultImpl extends DefaultSearchResult { - - private int[] documentCache; - - public MultiDocResultImpl(int saIntervalStart, int saIntervalLength, int patternLength) { - super(saIntervalStart, saIntervalLength, patternLength); - documentCache = new int[saIntervalLength]; - Arrays.fill(documentCache, UNDEF); - } - - @Override - protected int documentIndex(int hitIdx) { - if (documentCache[hitIdx] == UNDEF) { - int pos = globalPosition(hitIdx); - int r = Arrays.binarySearch(offsets, pos); - documentCache[hitIdx] = r >= 0 ? 
r : -r - 2; - } - return documentCache[hitIdx]; - } - - @Override - protected int position(int hitIdx) { - return globalPosition(hitIdx) - offsets[documentIndex(hitIdx)]; - } - - protected int[] positions() { - int[] positions = new int[size()]; - for (int i = 0; i < positions.length; i++) { - positions[i] = position(i); - } - return positions; - } - - @Override - protected int checkBefore(int pos, int maxSize) { - int leftLimit = super.checkBefore(pos, maxSize); - for (int i = pos - separator.length; i >= leftLimit; i--) { - if (separatorAt(i)) { - return i + separator.length; - } - } - return leftLimit; - } - - @Override - protected int checkAfter(int pos, int maxSize) { - int rightLimit = super.checkAfter(pos, maxSize); - int sepRightLimit = rightLimit - separator.length; - for (int i = pos; i <= sepRightLimit; i++) { - if (separatorAt(i)) { - return i; - } - } - return rightLimit; - } - - @Override - public DefaultHit hitWithPosition(int document, int position) { - return hitWithGlobalPosition(offsets[document] + position); - } - - @Override - protected int linesBeforeStart(int i, int maxLines) { - int j = globalPosition(i); - int newLine = 0; - int lineCount = 0; - boolean sep = separatorAt(j); - while (j >= 0 && !sep && lineCount <= maxLines) { - newLine = isNewLine(j, data); - if (newLine > 0) { - lineCount++; - } - sep = separatorAt(--j); - } - /* - * if separator is contained in (newLineInSeparator == -1) or equal - * to (newLineInSeparator == 0) newline sequence this means that the - * newline sequence never appears in the data. That means that - * isNewLine always returns 0, lineCount never increases and - * therefore the loop is ended only by the separator. in both cases - * we want to return j + 1 + separator.length - * - * if newLine is contained (but not equal) in the separator - * (newLineInSeparator > 0) we want to return - * - */ - int newLineEnd = j + 1 + newLine; - if (newLineInSeparator == -1) { - return newLineEnd + (sep ? 
separator.length - 1 : 0); - } else { - int limit = Math.max(0, j - newLineInSeparator); - while (j >= limit && !sep) { - sep = separatorAt(--j); - } - return sep ? j + separator.length : newLineEnd; - } - } - - @Override - protected int linesAfterStart(int i, int maxLines) { - int j = globalPosition(i) + patternLength(); - int lineCount = 0; - boolean sep = false; - while (j < data.length && !(sep = separatorAt(j)) && lineCount <= maxLines) { - if (isNewLine(j, data) > 0) { - lineCount++; - } - j++; - } - return j == data.length || sep ? j : j - 1; - } - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/impl/SafeHitContext.java b/server/src/main/java/sk/linhard/exactly/impl/SafeHitContext.java deleted file mode 100644 index 20f69eb..0000000 --- a/server/src/main/java/sk/linhard/exactly/impl/SafeHitContext.java +++ /dev/null @@ -1,47 +0,0 @@ -package sk.linhard.exactly.impl; - -import java.nio.charset.Charset; - -import org.bouncycastle.util.Arrays; - -/** - * Removes special characters from context for better display in some cases - * - */ -public class SafeHitContext extends DefaultHitContext { - - SafeHitContext(DefaultSearch search, int ctxPosition, int beforeLength, int patternLength, int afterLength) { - super(search, ctxPosition, beforeLength, patternLength, afterLength); - } - - private static byte[] clean(byte[] data) { - for (int i = 0; i < data.length; i++) { - data[i] = clean(data[i]); - } - return data; - } - - private static byte clean(byte c) { - return c == 127 || c < 32 ? 
32 : c; - } - - public static String toSafeString(byte[] bytes, Charset charset) { - byte[] cleanCopy = clean(Arrays.copyOf(bytes, bytes.length)); - return new String(cleanCopy, charset); - } - - @Override - public byte[] before() { - return clean(super.before()); - } - - @Override - public byte[] pattern() { - return clean(super.pattern()); - } - - @Override - public byte[] after() { - return clean(super.after()); - } -} diff --git a/server/src/main/java/sk/linhard/exactly/impl/sais.java b/server/src/main/java/sk/linhard/exactly/impl/sais.java deleted file mode 100644 index d39df17..0000000 --- a/server/src/main/java/sk/linhard/exactly/impl/sais.java +++ /dev/null @@ -1,449 +0,0 @@ -package sk.linhard.exactly.impl; -/* - * sais.java for sais-java - * Copyright (c) 2008-2010 Yuta Mori All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -import java.lang.String; - -@SuppressWarnings("all") -public class sais { - private static interface BaseArray { - public int get(int i); - public void set(int i, int val); - public int update(int i, int val); - } - private static class ByteArray implements BaseArray { - private byte[] m_A = null; - private int m_pos = 0; - ByteArray(byte[] A, int pos) { m_A = A; m_pos = pos; } - public int get(int i) { return m_A[m_pos + i] & 0xff; } - public void set(int i, int val) { m_A[m_pos + i] = (byte)(val & 0xff); } - public int update(int i, int val) { return m_A[m_pos + i] += val & 0xff; } - } - private static class CharArray implements BaseArray { - private char[] m_A = null; - private int m_pos = 0; - CharArray(char[] A, int pos) { m_A = A; m_pos = pos; } - public int get(int i) { return m_A[m_pos + i] & 0xffff; } - public void set(int i, int val) { m_A[m_pos + i] = (char)(val & 0xffff); } - public int update(int i, int val) { return m_A[m_pos + i] += val & 0xffff; } - } - private static class ShortArray implements BaseArray { - private short[] m_A = null; - private int m_pos = 0; - ShortArray(short[] A, int pos) { m_A = A; m_pos = pos; } - public int get(int i) { return m_A[m_pos + i] & 0xffff; } - public void set(int i, int val) { m_A[m_pos + i] = (short)(val & 0xffff); } - public int update(int i, int val) { return m_A[m_pos + i] += val & 0xffff; } - } - private static class IntArray implements BaseArray { - private int[] m_A = null; - private int m_pos = 0; - IntArray(int[] A, int pos) { m_A = A; m_pos = pos; } - public int get(int i) { return m_A[m_pos + i]; } - public void set(int i, int val) { m_A[m_pos + i] = val; } - public int update(int i, int val) { return m_A[m_pos + i] += val; } - } - private static class StringArray implements BaseArray { - private String m_A = null; - private int m_pos = 0; - StringArray(String A, int pos) { m_A = A; m_pos = pos; } - public int get(int i) { return (int)(m_A.charAt(m_pos + i) & 0xffff); } - public void set(int i, 
int val) { } - public int update(int i, int val) { return 0; } - } - - /* find the start or end of each bucket */ - private static - void - getCounts(BaseArray T, BaseArray C, int n, int k) { - int i; - for(i = 0; i < k; ++i) { C.set(i, 0); } - for(i = 0; i < n; ++i) { C.update(T.get(i), 1); } - } - private static - void - getBuckets(BaseArray C, BaseArray B, int k, boolean end) { - int i, sum = 0; - if(end != false) { for(i = 0; i < k; ++i) { sum += C.get(i); B.set(i, sum); } } - else { for(i = 0; i < k; ++i) { sum += C.get(i); B.set(i, sum - C.get(i)); } } - } - - /* sort all type LMS suffixes */ - private static - void - LMSsort(BaseArray T, int[] SA, BaseArray C, BaseArray B, int n, int k) { - int b, i, j; - int c0, c1; - /* compute SAl */ - if(C == B) { getCounts(T, C, n, k); } - getBuckets(C, B, k, false); /* find starts of buckets */ - j = n - 1; - b = B.get(c1 = T.get(j)); - --j; - SA[b++] = (T.get(j) < c1) ? ~j : j; - for(i = 0; i < n; ++i) { - if(0 < (j = SA[i])) { - if((c0 = T.get(j)) != c1) { B.set(c1, b); b = B.get(c1 = c0); } - --j; - SA[b++] = (T.get(j) < c1) ? ~j : j; - SA[i] = 0; - } else if(j < 0) { - SA[i] = ~j; - } - } - /* compute SAs */ - if(C == B) { getCounts(T, C, n, k); } - getBuckets(C, B, k, true); /* find ends of buckets */ - for(i = n - 1, b = B.get(c1 = 0); 0 <= i; --i) { - if(0 < (j = SA[i])) { - if((c0 = T.get(j)) != c1) { B.set(c1, b); b = B.get(c1 = c0); } - --j; - SA[--b] = (T.get(j) > c1) ? 
~(j + 1) : j; - SA[i] = 0; - } - } - } - private static - int - LMSpostproc(BaseArray T, int[] SA, int n, int m) { - int i, j, p, q, plen, qlen, name; - int c0, c1; - boolean diff; - - /* compact all the sorted substrings into the first m items of SA - 2*m must be not larger than n (proveable) */ - for(i = 0; (p = SA[i]) < 0; ++i) { SA[i] = ~p; } - if(i < m) { - for(j = i, ++i;; ++i) { - if((p = SA[i]) < 0) { - SA[j++] = ~p; SA[i] = 0; - if(j == m) { break; } - } - } - } - - /* store the length of all substrings */ - i = n - 1; j = n - 1; c0 = T.get(n - 1); - do { c1 = c0; } while((0 <= --i) && ((c0 = T.get(i)) >= c1)); - for(; 0 <= i;) { - do { c1 = c0; } while((0 <= --i) && ((c0 = T.get(i)) <= c1)); - if(0 <= i) { - SA[m + ((i + 1) >> 1)] = j - i; j = i + 1; - do { c1 = c0; } while((0 <= --i) && ((c0 = T.get(i)) >= c1)); - } - } - - /* find the lexicographic names of all substrings */ - for(i = 0, name = 0, q = n, qlen = 0; i < m; ++i) { - p = SA[i]; plen = SA[m + (p >> 1)]; diff = true; - if((plen == qlen) && ((q + plen) < n)) { - for(j = 0; (j < plen) && (T.get(p + j) == T.get(q + j)); ++j) { } - if(j == plen) { diff = false; } - } - if(diff != false) { ++name; q = p; qlen = plen; } - SA[m + (p >> 1)] = name; - } - - return name; - } - - /* compute SA and BWT */ - private static - void - induceSA(BaseArray T, int[] SA, BaseArray C, BaseArray B, int n, int k) { - int b, i, j; - int c0, c1; - /* compute SAl */ - if(C == B) { getCounts(T, C, n, k); } - getBuckets(C, B, k, false); /* find starts of buckets */ - j = n - 1; - b = B.get(c1 = T.get(j)); - SA[b++] = ((0 < j) && (T.get(j - 1) < c1)) ? ~j : j; - for(i = 0; i < n; ++i) { - j = SA[i]; SA[i] = ~j; - if(0 < j) { - if((c0 = T.get(--j)) != c1) { B.set(c1, b); b = B.get(c1 = c0); } - SA[b++] = ((0 < j) && (T.get(j - 1) < c1)) ? 
~j : j; - } - } - /* compute SAs */ - if(C == B) { getCounts(T, C, n, k); } - getBuckets(C, B, k, true); /* find ends of buckets */ - for(i = n - 1, b = B.get(c1 = 0); 0 <= i; --i) { - if(0 < (j = SA[i])) { - if((c0 = T.get(--j)) != c1) { B.set(c1, b); b = B.get(c1 = c0); } - SA[--b] = ((j == 0) || (T.get(j - 1) > c1)) ? ~j : j; - } else { - SA[i] = ~j; - } - } - } - private static - int - computeBWT(BaseArray T, int[] SA, BaseArray C, BaseArray B, int n, int k) { - int b, i, j, pidx = -1; - int c0, c1; - /* compute SAl */ - if(C == B) { getCounts(T, C, n, k); } - getBuckets(C, B, k, false); /* find starts of buckets */ - j = n - 1; - b = B.get(c1 = T.get(j)); - SA[b++] = ((0 < j) && (T.get(j - 1) < c1)) ? ~j : j; - for(i = 0; i < n; ++i) { - if(0 < (j = SA[i])) { - SA[i] = ~(c0 = T.get(--j)); - if(c0 != c1) { B.set(c1, b); b = B.get(c1 = c0); } - SA[b++] = ((0 < j) && (T.get(j - 1) < c1)) ? ~j : j; - } else if(j != 0) { - SA[i] = ~j; - } - } - /* compute SAs */ - if(C == B) { getCounts(T, C, n, k); } - getBuckets(C, B, k, true); /* find ends of buckets */ - for(i = n - 1, b = B.get(c1 = 0); 0 <= i; --i) { - if(0 < (j = SA[i])) { - SA[i] = (c0 = T.get(--j)); - if(c0 != c1) { B.set(c1, b); b = B.get(c1 = c0); } - SA[--b] = ((0 < j) && (T.get(j - 1) > c1)) ? 
~((int)T.get(j - 1)) : j; - } else if(j != 0) { - SA[i] = ~j; - } else { - pidx = i; - } - } - return pidx; - } - - /* find the suffix array SA of T[0..n-1] in {0..k-1}^n - use a working space (excluding T and SA) of at most 2n+O(1) for a constant alphabet */ - private static - int - SA_IS(BaseArray T, int[] SA, int fs, int n, int k, boolean isbwt) { - BaseArray C, B, RA; - int i, j, b, c, m, p, q, name, pidx = 0, newfs; - int c0, c1; - int flags = 0; - - if(k <= 256) { - C = new IntArray(new int[k], 0); - if(k <= fs) { B = new IntArray(SA, n + fs - k); flags = 1; } - else { B = new IntArray(new int[k], 0); flags = 3; } - } else if(k <= fs) { - C = new IntArray(SA, n + fs - k); - if(k <= (fs - k)) { B = new IntArray(SA, n + fs - k * 2); flags = 0; } - else if(k <= 1024) { B = new IntArray(new int[k], 0); flags = 2; } - else { B = C; flags = 8; } - } else { - C = B = new IntArray(new int[k], 0); - flags = 4 | 8; - } - - /* stage 1: reduce the problem by at least 1/2 - sort all the LMS-substrings */ - getCounts(T, C, n, k); getBuckets(C, B, k, true); /* find ends of buckets */ - for(i = 0; i < n; ++i) { SA[i] = 0; } - b = -1; i = n - 1; j = n; m = 0; c0 = T.get(n - 1); - do { c1 = c0; } while((0 <= --i) && ((c0 = T.get(i)) >= c1)); - for(; 0 <= i;) { - do { c1 = c0; } while((0 <= --i) && ((c0 = T.get(i)) <= c1)); - if(0 <= i) { - if(0 <= b) { SA[b] = j; } b = B.update(c1, -1); j = i; ++m; - do { c1 = c0; } while((0 <= --i) && ((c0 = T.get(i)) >= c1)); - } - } - if(1 < m) { - LMSsort(T, SA, C, B, n, k); - name = LMSpostproc(T, SA, n, m); - } else if(m == 1) { - SA[b] = j + 1; - name = 1; - } else { - name = 0; - } - - /* stage 2: solve the reduced problem - recurse if names are not yet unique */ - if(name < m) { - if((flags & 4) != 0) { C = null; B = null; } - if((flags & 2) != 0) { B = null; } - newfs = (n + fs) - (m * 2); - if((flags & (1 | 4 | 8)) == 0) { - if((k + name) <= newfs) { newfs -= k; } - else { flags |= 8; } - } - for(i = m + (n >> 1) - 1, j = m * 2 + 
newfs - 1; m <= i; --i) { - if(SA[i] != 0) { SA[j--] = SA[i] - 1; } - } - RA = new IntArray(SA, m + newfs); - SA_IS(RA, SA, newfs, m, name, false); - RA = null; - - i = n - 1; j = m * 2 - 1; c0 = T.get(n - 1); - do { c1 = c0; } while((0 <= --i) && ((c0 = T.get(i)) >= c1)); - for(; 0 <= i;) { - do { c1 = c0; } while((0 <= --i) && ((c0 = T.get(i)) <= c1)); - if(0 <= i) { - SA[j--] = i + 1; - do { c1 = c0; } while((0 <= --i) && ((c0 = T.get(i)) >= c1)); - } - } - - for(i = 0; i < m; ++i) { SA[i] = SA[m + SA[i]]; } - if((flags & 4) != 0) { C = B = new IntArray(new int[k], 0); } - if((flags & 2) != 0) { B = new IntArray(new int[k], 0); } - } - - /* stage 3: induce the result for the original problem */ - if((flags & 8) != 0) { getCounts(T, C, n, k); } - /* put all left-most S characters into their buckets */ - if(1 < m) { - getBuckets(C, B, k, true); /* find ends of buckets */ - i = m - 1; j = n; p = SA[m - 1]; c1 = T.get(p); - do { - q = B.get(c0 = c1); - while(q < j) { SA[--j] = 0; } - do { - SA[--j] = p; - if(--i < 0) { break; } - p = SA[i]; - } while((c1 = T.get(p)) == c0); - } while(0 <= i); - while(0 < j) { SA[--j] = 0; } - } - if(isbwt == false) { induceSA(T, SA, C, B, n, k); } - else { pidx = computeBWT(T, SA, C, B, n, k); } - C = null; B = null; - return pidx; - } - - /** Suffixsorting **/ - /* byte */ - public static - int - suffixsort(byte[] T, int[] SA, int n) { - if((T == null) || (SA == null) || (T.length < n) || (SA.length < n)) { return -1; } - if(n <= 1) { if(n == 1) { SA[0] = 0; } return 0; } - return SA_IS(new ByteArray(T, 0), SA, 0, n, 256, false); - } - /* char */ - public static - int - suffixsort(char[] T, int[] SA, int n) { - if((T == null) || (SA == null) || (T.length < n) || (SA.length < n)) { return -1; } - if(n <= 1) { if(n == 1) { SA[0] = 0; } return 0; } - return SA_IS(new CharArray(T, 0), SA, 0, n, 65536, false); - } - /* short */ - public static - int - suffixsort(short[] T, int[] SA, int n, int k) { - if((T == null) || (SA == null) || - 
(T.length < n) || (SA.length < n) || - (k <= 0) || (65536 < k)) { return -1; } - if(n <= 1) { if(n == 1) { SA[0] = 0; } return 0; } - return SA_IS(new ShortArray(T, 0), SA, 0, n, k, false); - } - /* int */ - public static - int - suffixsort(int[] T, int[] SA, int n, int k) { - if((T == null) || (SA == null) || - (T.length < n) || (SA.length < n) || - (k <= 0)) { return -1; } - if(n <= 1) { if(n == 1) { SA[0] = 0; } return 0; } - return SA_IS(new IntArray(T, 0), SA, 0, n, k, false); - } - /* String */ - public static - int - suffixsort(String T, int[] SA, int n) { - if((T == null) || (SA == null) || - (T.length() < n) || (SA.length < n)) { return -1; } - if(n <= 1) { if(n == 1) { SA[0] = 0; } return 0; } - return SA_IS(new StringArray(T, 0), SA, 0, n, 65536, false); - } - - /** Burrows-Wheeler Transform **/ - /* byte */ - public static - int - bwtransform(byte[] T, byte[] U, int[] A, int n) { - int i, pidx; - if((T == null) || (U == null) || (A == null) || - (T.length < n) || (U.length < n) || (A.length < n)) { return -1; } - if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; } - pidx = SA_IS(new ByteArray(T, 0), A, 0, n, 256, true); - U[0] = T[n - 1]; - for(i = 0; i < pidx; ++i) { U[i + 1] = (byte)(A[i] & 0xff); } - for(i += 1; i < n; ++i) { U[i] = (byte)(A[i] & 0xff); } - return pidx + 1; - } - /* char */ - public static - int - bwtransform(char[] T, char[] U, int[] A, int n) { - int i, pidx; - if((T == null) || (U == null) || (A == null) || - (T.length < n) || (U.length < n) || (A.length < n)) { return -1; } - if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; } - pidx = SA_IS(new CharArray(T, 0), A, 0, n, 65536, true); - U[0] = T[n - 1]; - for(i = 0; i < pidx; ++i) { U[i + 1] = (char)(A[i] & 0xffff); } - for(i += 1; i < n; ++i) { U[i] = (char)(A[i] & 0xffff); } - return pidx + 1; - } - /* short */ - public static - int - bwtransform(short[] T, short[] U, int[] A, int n, int k) { - int i, pidx; - if((T == null) || (U == null) || (A == null) || - (T.length < n) || 
(U.length < n) || (A.length < n) || - (k <= 0) || (65536 < k)) { return -1; } - if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; } - pidx = SA_IS(new ShortArray(T, 0), A, 0, n, k, true); - U[0] = T[n - 1]; - for(i = 0; i < pidx; ++i) { U[i + 1] = (short)(A[i] & 0xffff); } - for(i += 1; i < n; ++i) { U[i] = (short)(A[i] & 0xffff); } - return pidx + 1; - } - /* int */ - public static - int - bwtransform(int[] T, int[] U, int[] A, int n, int k) { - int i, pidx; - if((T == null) || (U == null) || (A == null) || - (T.length < n) || (U.length < n) || (A.length < n) || - (k <= 0)) { return -1; } - if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; } - pidx = SA_IS(new IntArray(T, 0), A, 0, n, k, true); - U[0] = T[n - 1]; - for(i = 0; i < pidx; ++i) { U[i + 1] = A[i]; } - for(i += 1; i < n; ++i) { U[i] = A[i]; } - return pidx + 1; - } -} diff --git a/server/src/main/java/sk/linhard/exactly/lucene/LuceneSearch.java b/server/src/main/java/sk/linhard/exactly/lucene/LuceneSearch.java deleted file mode 100644 index 3afcf68..0000000 --- a/server/src/main/java/sk/linhard/exactly/lucene/LuceneSearch.java +++ /dev/null @@ -1,27 +0,0 @@ -package sk.linhard.exactly.lucene; - -import sk.linhard.exactly.Document; -import sk.linhard.exactly.Search; -import sk.linhard.exactly.SearchResult; - -public class LuceneSearch implements Search { - - @Override - public SearchResult find(String pattern) { - // TODO Auto-generated method stub - return null; - } - - @Override - public Document document(int i) { - // TODO Auto-generated method stub - return null; - } - - @Override - public int documentCount() { - // TODO Auto-generated method stub - return 0; - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/lucene/LuceneSearchBuilder.java b/server/src/main/java/sk/linhard/exactly/lucene/LuceneSearchBuilder.java deleted file mode 100644 index 18676f3..0000000 --- a/server/src/main/java/sk/linhard/exactly/lucene/LuceneSearchBuilder.java +++ /dev/null @@ -1,53 +0,0 @@ -package 
sk.linhard.exactly.lucene; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.nio.charset.Charset; - -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.StringField; -import org.apache.lucene.document.TextField; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.RAMDirectory; - -public class LuceneSearchBuilder { - - private StandardAnalyzer analyzer; - private Directory index; - private IndexWriterConfig config; - private IndexWriter w; - - public LuceneSearchBuilder() { - try { - analyzer = new StandardAnalyzer(); - index = new RAMDirectory(); - config = new IndexWriterConfig(analyzer); - w = new IndexWriter(index, config); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - public void addDocument(File file) { - try { - Document doc = new Document(); - doc.add(new TextField("text", - new BufferedReader(new InputStreamReader(new FileInputStream(file), Charset.forName("UTF-8"))))); - doc.add(new StringField("path", file.getAbsolutePath(), Field.Store.YES)); - w.addDocument(doc); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - public LuceneSearch build() { - return null; - } -} diff --git a/server/src/main/java/sk/linhard/exactly/rest/Application.java b/server/src/main/java/sk/linhard/exactly/rest/Application.java deleted file mode 100644 index de1f497..0000000 --- a/server/src/main/java/sk/linhard/exactly/rest/Application.java +++ /dev/null @@ -1,12 +0,0 @@ -package sk.linhard.exactly.rest; - -import org.springframework.boot.SpringApplication; -import org.springframework.boot.autoconfigure.SpringBootApplication; - -@SpringBootApplication -public class 
Application { - - public static void main(String[] args) { - SpringApplication.run(Application.class, args); - } -} \ No newline at end of file diff --git a/server/src/main/java/sk/linhard/exactly/rest/DocumentRequest.java b/server/src/main/java/sk/linhard/exactly/rest/DocumentRequest.java deleted file mode 100644 index 54ee549..0000000 --- a/server/src/main/java/sk/linhard/exactly/rest/DocumentRequest.java +++ /dev/null @@ -1,39 +0,0 @@ -package sk.linhard.exactly.rest; - -import org.json.JSONException; -import org.json.JSONObject; - -import com.fasterxml.jackson.annotation.JsonProperty; - -public class DocumentRequest { - - @JsonProperty("document_id") - private final String documentId; - - @JsonProperty("document_index") - private final Integer documentIndex; - - public DocumentRequest(String documentId, Integer documentIndex) { - this.documentId = documentId; - this.documentIndex = documentIndex; - } - - public DocumentRequest(String jsonStr) { - try { - JSONObject jsonObj = new JSONObject(jsonStr); - documentId = jsonObj.has("document_id") ? jsonObj.getString("document_id") : null; - documentIndex = jsonObj.has("document_index") ? 
jsonObj.getInt("document_index") : null; - } catch (JSONException e) { - throw new RuntimeException("Error parsing JSON", e); - } - } - - public String getDocumentId() { - return documentId; - } - - public Integer getDocumentIndex() { - return documentIndex; - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/rest/DocumentResponse.java b/server/src/main/java/sk/linhard/exactly/rest/DocumentResponse.java deleted file mode 100644 index 75e4c93..0000000 --- a/server/src/main/java/sk/linhard/exactly/rest/DocumentResponse.java +++ /dev/null @@ -1,49 +0,0 @@ -package sk.linhard.exactly.rest; - -import java.util.Base64; - -import org.json.JSONException; -import org.json.JSONObject; - -import com.fasterxml.jackson.annotation.JsonProperty; - -public class DocumentResponse { - @JsonProperty("document_id") - private final String documentId; - - @JsonProperty("document_index") - private final Integer documentIndex; - - private final byte[] content; - - public DocumentResponse(String documentId, Integer documentIndex, byte[] content) { - this.documentId = documentId; - this.documentIndex = documentIndex; - this.content = content; - } - - public DocumentResponse(String jsonStr) { - try { - JSONObject jsonObj = new JSONObject(jsonStr); - documentId = jsonObj.getString("document_id"); - String docIdxObj = jsonObj.getString("document_index"); - documentIndex = docIdxObj == null ? 
null : Integer.parseInt(docIdxObj); - content = Base64.getDecoder().decode(jsonObj.getString("content")); - } catch (JSONException e) { - throw new RuntimeException("Error parsing JSON", e); - } - } - - public String getDocumentId() { - return documentId; - } - - public Integer getDocumentIndex() { - return documentIndex; - } - - public byte[] getContent() { - return content; - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/rest/SearchController.java b/server/src/main/java/sk/linhard/exactly/rest/SearchController.java deleted file mode 100644 index dba42df..0000000 --- a/server/src/main/java/sk/linhard/exactly/rest/SearchController.java +++ /dev/null @@ -1,41 +0,0 @@ -package sk.linhard.exactly.rest; - -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.web.bind.annotation.PathVariable; -import org.springframework.web.bind.annotation.RequestBody; -import org.springframework.web.bind.annotation.RequestMapping; -import org.springframework.web.bind.annotation.RequestMethod; -import org.springframework.web.bind.annotation.RestController; - -@RestController -public class SearchController { - - @Autowired - private SearchServer server; - - @RequestMapping(value = "/search", method = RequestMethod.POST) - public SearchResponse search(@RequestBody SearchRequest request) { - return server.search(request); - } - - @RequestMapping(value = "/stats") - public SearchServerStats stats() { - return server.stats(); - } - - @RequestMapping(value = "/version") - public String version() { - return server.version(); - } - - @RequestMapping(value = "/document", method = RequestMethod.POST) - public DocumentResponse requestDocument(@RequestBody DocumentRequest request) { - return server.requestDocument(request); - } - - @RequestMapping(value = "/document/{idx}", method = RequestMethod.GET) - public DocumentResponse requestDocument(@PathVariable("idx") int idx) { - return server.requestDocument(new DocumentRequest(null, idx)); - } - -} 
\ No newline at end of file diff --git a/server/src/main/java/sk/linhard/exactly/rest/SearchRequest.java b/server/src/main/java/sk/linhard/exactly/rest/SearchRequest.java deleted file mode 100644 index 2cd37b6..0000000 --- a/server/src/main/java/sk/linhard/exactly/rest/SearchRequest.java +++ /dev/null @@ -1,58 +0,0 @@ -package sk.linhard.exactly.rest; - -import java.util.Base64; - -import org.json.JSONException; -import org.json.JSONObject; - -import com.fasterxml.jackson.annotation.JsonProperty; - -public class SearchRequest { - - private final byte[] pattern; - - @JsonProperty("max_hits") - private final int maxHits; - - @JsonProperty("max_context") - private final int maxContext; - - private final int offset; - - public SearchRequest(String jsonStr) { - try { - JSONObject jsonObj = new JSONObject(jsonStr); - pattern = Base64.getDecoder().decode(jsonObj.getString("pattern")); - maxHits = jsonObj.getInt("max_hits"); - maxContext = jsonObj.getInt("max_context"); - offset = jsonObj.has("offset") ? 
jsonObj.getInt("offset") : 0; - } catch (JSONException e) { - throw new RuntimeException("Error parsing JSON", e); - } - } - - public SearchRequest(byte[] pattern, int maxHits, int maxContext, int offset) { - super(); - this.pattern = pattern; - this.maxHits = maxHits; - this.maxContext = maxContext; - this.offset = offset; - } - - public byte[] getPattern() { - return pattern; - } - - public int getMaxHits() { - return maxHits; - } - - public int getMaxContext() { - return maxContext; - } - - public int getOffset() { - return offset; - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/rest/SearchResponse.java b/server/src/main/java/sk/linhard/exactly/rest/SearchResponse.java deleted file mode 100644 index f8c3807..0000000 --- a/server/src/main/java/sk/linhard/exactly/rest/SearchResponse.java +++ /dev/null @@ -1,85 +0,0 @@ -package sk.linhard.exactly.rest; - -import java.util.List; - -import com.fasterxml.jackson.annotation.JsonProperty; - -public class SearchResponse { - - private final List hits; - private final Cursor cursor; - - public SearchResponse(List hits, Cursor cursor) { - this.hits = hits; - this.cursor = cursor; - } - - public List getHits() { - return hits; - } - - public Cursor getCursor() { - return cursor; - } - - public static class Hit { - - private final int pos; - - @JsonProperty("doc_id") - private final String docId; - - @JsonProperty("ctx_before") - private final byte[] ctxBefore; - - @JsonProperty("ctx_after") - private final byte[] ctxAfter; - - public Hit(int pos, String docId, byte[] ctxBefore, byte[] ctxAfter) { - this.pos = pos; - this.docId = docId; - this.ctxBefore = ctxBefore; - this.ctxAfter = ctxAfter; - } - - public int getPos() { - return pos; - } - - public String getDocId() { - return docId; - } - - public byte[] getCtxBefore() { - return ctxBefore; - } - - public byte[] getCtxAfter() { - return ctxAfter; - } - - } - - public static class Cursor { - - @JsonProperty("complete_size") - private final int completeSize; - 
- private final int offset; - - public Cursor(int completeSize, int offset) { - this.completeSize = completeSize; - this.offset = offset; - } - - public int getCompleteSize() { - return completeSize; - } - - public int getOffset() { - return offset; - } - - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/rest/SearchServer.java b/server/src/main/java/sk/linhard/exactly/rest/SearchServer.java deleted file mode 100644 index f20db91..0000000 --- a/server/src/main/java/sk/linhard/exactly/rest/SearchServer.java +++ /dev/null @@ -1,171 +0,0 @@ -package sk.linhard.exactly.rest; - -import java.io.File; -import java.io.InputStream; -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; - -import javax.annotation.PostConstruct; -import javax.ws.rs.NotFoundException; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.tuple.Pair; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.context.annotation.Scope; -import org.springframework.core.io.ClassPathResource; -import org.springframework.stereotype.Component; - -import com.google.common.collect.ImmutableList; - -import sk.linhard.exactly.Document; -import sk.linhard.exactly.Hit; -import sk.linhard.exactly.HitContext; -import sk.linhard.exactly.Search; -import sk.linhard.exactly.SearchResult; -import sk.linhard.exactly.impl.FileLoader; -import sk.linhard.exactly.impl.IndexingProgressReporter; -import sk.linhard.exactly.impl.IndexingProgressReporter.IndexingProgress; -import sk.linhard.exactly.impl.SafeHitContext; -import sk.linhard.exactly.rest.SearchResponse.Cursor; - -@Scope(value = "singleton") -@Component -public class SearchServer { - - private static final Logger log = LoggerFactory.getLogger(SearchServer.class); - - 
@Autowired - private SearchServerConfig config; - - private ExecutorService executor = Executors.newCachedThreadPool(); - - private IndexingProgressReporter reporter = new IndexingProgressReporter(); - - private Pair, Map> search; - - private String cachedVersion; - - public IndexingProgress getProgress() { - return reporter.getProgress(); - } - - @PostConstruct - public void start() { - log.info("Root folder: {}", config.getIndexedFolderRoot().getAbsolutePath()); - executor.submit(() -> { - indexFolder(config.getIndexedFolderRoot(), reporter); - }); - } - - private void indexFolder(File folder, IndexingProgressReporter reporter) { - log.info("Started indexing folder {}", folder.getAbsolutePath()); - synchronized (SearchServer.this) { - search = null; - } - FileLoader loader = new FileLoader(ImmutableList.of(folder), reporter); - loader.crawl(); - loader.load(); - Search aSearch = loader.indexBinary(); - Map anIdToIdx = computeIdToIdx(aSearch); - synchronized (SearchServer.this) { - search = Pair.of(aSearch, anIdToIdx); - } - IndexingProgress progress = reporter.getProgress(); - log.info("Done indexing {} files, {} bytes total", progress.getFormattedLoadingProgressFiles(), - progress.getFormattedLoadingProgressBytes()); - } - - // TODO: this is lame, search by document ID should be a suffix array search - // as well - // suffix array could be sparse (we only care about the whole id string, not - // search inside) - private Map computeIdToIdx(Search aSearch) { - Map anIdToIdx = new HashMap<>(); - int n = aSearch.documentCount(); - for (int i = 0; i < n; i++) { - Document doc = aSearch.document(i); - if (doc.index() != i) { - throw new IllegalStateException("Wrong document index " + i + " vs " + doc.index()); - } - anIdToIdx.put(doc.id(), i); - } - return anIdToIdx; - } - - private synchronized Pair, Map> checkSearch() { - if (search == null) { - throw new RuntimeException("Search not ready yet"); - } - return search; - } - - public SearchResponse 
search(SearchRequest request) { - Pair, Map> search = checkSearch(); - SearchResult searchResult = search.getKey().find(request.getPattern()); - List hits = new ArrayList<>(searchResult.size()); - Cursor cursor = null; - for (Hit hit : searchResult.skipIterator(request.getOffset())) { - HitContext ctx = hit.charContext(request.getMaxContext(), request.getMaxContext()); - hits.add(new SearchResponse.Hit(hit.position(), hit.document().id(), ctx.before(), ctx.after())); - if (hits.size() >= request.getMaxHits()) { - break; - } - } - if (hits.size() < searchResult.size()) { - cursor = new Cursor(searchResult.size(), request.getOffset()); - } - - if (log.isDebugEnabled()) { - String safePattern = SafeHitContext.toSafeString(request.getPattern(), Charset.forName("UTF-8")); - log.debug("Search for '{}', max_hits={}, max_ctx={}, offset={}, returned {} hits{}", // - safePattern, // - request.getMaxHits(), // - request.getMaxContext(), // - request.getOffset(), // - hits.size(), // - cursor == null ? 
"" : " of " + searchResult.size()); - } - return new SearchResponse(hits, cursor); - } - - public SearchServerStats stats() { - return new SearchServerStats(getProgress()); - } - - public DocumentResponse requestDocument(DocumentRequest request) { - Integer documentIndex = request.getDocumentIndex(); - if (documentIndex == null) { - documentIndex = search.getValue().get(request.getDocumentId()); - } - if (documentIndex == null) { - throw new NotFoundException(); - } - Document document = search.getKey().document(documentIndex); - return new DocumentResponse(document.id(), document.index(), document.content()); - } - - public String version() { - if (cachedVersion == null) { - try { - InputStream in = new ClassPathResource("VERSION").getInputStream(); - if (in == null) { - log.error("Can't find VERSION file"); - return "UNKNOWN"; - } else { - cachedVersion = IOUtils.toString(in, Charset.forName("UTF-8")); - } - } catch (Exception e) { - log.error("Error while discovering version", e); - return "UNKNOWN"; - } - } - return cachedVersion; - } -} diff --git a/server/src/main/java/sk/linhard/exactly/rest/SearchServerConfig.java b/server/src/main/java/sk/linhard/exactly/rest/SearchServerConfig.java deleted file mode 100644 index a352a7a..0000000 --- a/server/src/main/java/sk/linhard/exactly/rest/SearchServerConfig.java +++ /dev/null @@ -1,17 +0,0 @@ -package sk.linhard.exactly.rest; - -import java.io.File; - -import org.springframework.beans.factory.annotation.Value; -import org.springframework.context.annotation.Configuration; - -@Configuration -public class SearchServerConfig { - - @Value("${root.folder}") - private String indexedFolderRoot; - - public File getIndexedFolderRoot() { - return new File(indexedFolderRoot); - } -} diff --git a/server/src/main/java/sk/linhard/exactly/rest/SearchServerStats.java b/server/src/main/java/sk/linhard/exactly/rest/SearchServerStats.java deleted file mode 100644 index 8534bab..0000000 --- 
a/server/src/main/java/sk/linhard/exactly/rest/SearchServerStats.java +++ /dev/null @@ -1,62 +0,0 @@ -package sk.linhard.exactly.rest; - -import com.fasterxml.jackson.annotation.JsonProperty; - -import sk.linhard.exactly.impl.IndexingProgressReporter.IndexingProgress; - -public class SearchServerStats { - - @JsonProperty("indexed_bytes") - private final int indexedBytes; - - @JsonProperty("indexed_files") - private final int indexedFiles; - - @JsonProperty("done_crawling") - private final boolean doneCrawling; - - @JsonProperty("done_loading") - private final boolean doneLoading; - - @JsonProperty("done_indexing") - private final boolean doneIndexing; - - public SearchServerStats(IndexingProgress progress) { - indexedBytes = progress.getLoadingProgressBytes(); - indexedFiles = progress.getLoadingProgressFiles(); - doneCrawling = progress.isDoneCrawling(); - doneLoading = progress.isDoneLoading(); - doneIndexing = progress.isDoneIndexing(); - } - - public SearchServerStats(int indexedBytes, int indexedFiles, boolean doneCrawling, boolean doneLoading, - boolean doneIndexing) { - super(); - this.indexedBytes = indexedBytes; - this.indexedFiles = indexedFiles; - this.doneCrawling = doneCrawling; - this.doneLoading = doneLoading; - this.doneIndexing = doneIndexing; - } - - public int getIndexedBytes() { - return indexedBytes; - } - - public int getIndexedFiles() { - return indexedFiles; - } - - public boolean isDoneCrawling() { - return doneCrawling; - } - - public boolean isDoneLoading() { - return doneLoading; - } - - public boolean isDoneIndexing() { - return doneIndexing; - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/tika/ParseResult.java b/server/src/main/java/sk/linhard/exactly/tika/ParseResult.java deleted file mode 100644 index 392d75f..0000000 --- a/server/src/main/java/sk/linhard/exactly/tika/ParseResult.java +++ /dev/null @@ -1,57 +0,0 @@ -package sk.linhard.exactly.tika; - -import java.io.File; - -public class ParseResult implements 
Comparable { - - private final Type type; - private final File file; - private final long fileSize; - private final int textLenght; - private final String contentSample; - private final long parseTime; - - public ParseResult(Type type, File file, long fileSize, int textLenght, String contentSample, long parseTime) { - super(); - this.type = type; - this.file = file; - this.fileSize = fileSize; - this.textLenght = textLenght; - this.contentSample = contentSample; - this.parseTime = parseTime; - } - - public Type getType() { - return type; - } - - public File getFile() { - return file; - } - - public long getFileSize() { - return fileSize; - } - - public Integer getTextLenght() { - return textLenght; - } - - public String getContentSample() { - return contentSample; - } - - public long getParseTime() { - return parseTime; - } - - public static enum Type { - PARSED, BIG, EMPTY, ERROR; - } - - @Override - public int compareTo(ParseResult o) { - return this.file.compareTo(o.file); - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/tika/Search.java b/server/src/main/java/sk/linhard/exactly/tika/Search.java deleted file mode 100644 index a0e3bee..0000000 --- a/server/src/main/java/sk/linhard/exactly/tika/Search.java +++ /dev/null @@ -1,45 +0,0 @@ -package sk.linhard.exactly.tika; - -import java.io.File; -import java.io.PrintWriter; -import java.util.Collections; -import java.util.List; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.collect.ImmutableList; - -public class Search { - private static final Logger log = LoggerFactory.getLogger(Search.class); - - public static void main(String[] args) throws Exception { - - List roots = ImmutableList.of(new File(args[0])); - TikaFsCrawler crawler = new TikaFsCrawler(roots); - List results = crawler.crawl(); - - log.info("Extracted {} bytes of text data from {} files", crawler.getTotalSize(), results.size()); - - Collections.sort(results); - - PrintWriter w = new 
PrintWriter(args[1]); - w.println("RESULT;PARSE_TIME;TEXT_SIZE;FILE_SIZE;FILE;SAMPLE"); - for (ParseResult res : results) { - w.print(res.getType().toString()); - w.print(";"); - w.print(Long.toString(res.getParseTime())); - w.print(";"); - w.print(Integer.toString(res.getTextLenght())); - w.print(";"); - w.print(Long.toString(res.getFileSize())); - w.print(";"); - w.print(res.getFile().getAbsolutePath()); - w.print(";"); - w.println(res.getContentSample() == null ? "" : res.getContentSample()); - } - w.flush(); - w.close(); - } - -} diff --git a/server/src/main/java/sk/linhard/exactly/tika/TikaFsCrawler.java b/server/src/main/java/sk/linhard/exactly/tika/TikaFsCrawler.java deleted file mode 100644 index 652eefd..0000000 --- a/server/src/main/java/sk/linhard/exactly/tika/TikaFsCrawler.java +++ /dev/null @@ -1,96 +0,0 @@ -package sk.linhard.exactly.tika; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.TimeUnit; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.lang.StringUtils; -import org.apache.tika.Tika; -import org.apache.tika.exception.TikaException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import sk.linhard.exactly.tika.ParseResult.Type; - -public class TikaFsCrawler { - - private static final long SIZE_LIMIT = 100_000_000l; - private static final Logger log = LoggerFactory.getLogger(TikaFsCrawler.class); - private final Tika tika; - private final List roots; - private long totalSize = 0l; - - public TikaFsCrawler(List roots) { - this.tika = new Tika(); - this.roots = roots; - } - - public List crawl() { - totalSize = 0l; - if (roots == null || roots.isEmpty()) { - return Collections.emptyList(); - } - List collector = new ArrayList<>(); - for (File file : roots) { - if (file.isDirectory()) { - collector.addAll(crawlDirectory(file)); - } else 
{ - collector.add(parseContent(file)); - } - } - return collector; - } - - private List crawlDirectory(File dir) { - if (dir == null) { - return Collections.emptyList(); - } - List collector = new ArrayList<>(); - for (File file : dir.listFiles()) { - if (file.isDirectory()) { - collector.addAll(crawlDirectory(file)); - } else { - collector.add(parseContent(file)); - } - } - return collector; - } - - private ParseResult parseContent(File file) { - log.info("Parsing {}", file.getAbsolutePath()); - long fileSize = FileUtils.sizeOf(file); - if (fileSize > SIZE_LIMIT) { - return new ParseResult(Type.BIG, file, fileSize, 0, null, 0); - } - long nanoBefore = System.nanoTime(); - try (InputStream stream = new FileInputStream(file)) { - String s = tika.parseToString(stream); - long duration = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - nanoBefore); - String stripped = StringUtils.stripToNull(StringUtils.normalizeSpace(s)); - if (stripped == null) { - return new ParseResult(Type.EMPTY, file, fileSize, 0, null, duration); - } else { - int len = stripped.length(); - if (len > 200) { - stripped = stripped.substring(0, 200); - } - totalSize += fileSize; - return new ParseResult(Type.PARSED, file, fileSize, len, stripped.replaceAll(";", "?"), duration); - } - } catch (IOException | TikaException e) { - long duration = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - nanoBefore); - return new ParseResult(Type.ERROR, file, fileSize, 0, null, duration); - } - } - - public long getTotalSize() { - return totalSize; - } - -} diff --git a/server/src/main/resources/VERSION b/server/src/main/resources/VERSION deleted file mode 100644 index d519d03..0000000 --- a/server/src/main/resources/VERSION +++ /dev/null @@ -1 +0,0 @@ -@project.version@ \ No newline at end of file diff --git a/server/src/main/resources/application.properties b/server/src/main/resources/application.properties deleted file mode 100644 index 0064508..0000000 --- a/server/src/main/resources/application.properties 
+++ /dev/null @@ -1,2 +0,0 @@ -server.port = ${port:9201} -root.folder = ${dir:.} \ No newline at end of file diff --git a/server/src/test/java/sk/linhard/exactly/gui/WindowTests.java b/server/src/test/java/sk/linhard/exactly/gui/WindowTests.java deleted file mode 100644 index a5f6f1b..0000000 --- a/server/src/test/java/sk/linhard/exactly/gui/WindowTests.java +++ /dev/null @@ -1,8 +0,0 @@ -package sk.linhard.exactly.gui; - -public class WindowTests { - - public static void main(String[] args) { - - } -} diff --git a/server/src/test/java/sk/linhard/exactly/impl/BasicTest.java b/server/src/test/java/sk/linhard/exactly/impl/BasicTest.java deleted file mode 100644 index 3660260..0000000 --- a/server/src/test/java/sk/linhard/exactly/impl/BasicTest.java +++ /dev/null @@ -1,35 +0,0 @@ -package sk.linhard.exactly.impl; - -import java.util.Arrays; - -import org.junit.Assert; -import org.junit.Test; - -public class BasicTest { - @Test - public void binarySearchTest() { - int[] a = new int[] { 0, 12, 15, 16, 81 }; - Assert.assertEquals(-1, Arrays.binarySearch(a, -1)); - Assert.assertEquals(0, Arrays.binarySearch(a, 0)); - Assert.assertEquals(-2, Arrays.binarySearch(a, 1)); - Assert.assertEquals(-2, Arrays.binarySearch(a, 2)); - Assert.assertEquals(-2, Arrays.binarySearch(a, 11)); - Assert.assertEquals(1, Arrays.binarySearch(a, 12)); - Assert.assertEquals(-3, Arrays.binarySearch(a, 13)); - Assert.assertEquals(-3, Arrays.binarySearch(a, 14)); - Assert.assertEquals(2, Arrays.binarySearch(a, 15)); - Assert.assertEquals(3, Arrays.binarySearch(a, 16)); - Assert.assertEquals(-5, Arrays.binarySearch(a, 17)); - Assert.assertEquals(-5, Arrays.binarySearch(a, 80)); - Assert.assertEquals(4, Arrays.binarySearch(a, 81)); - Assert.assertEquals(-6, Arrays.binarySearch(a, 82)); - } - - @Test - public void byteEqualityTest() { - Byte b1 = new Byte((byte) 1); - Byte b2 = new Byte((byte) 1); - Assert.assertEquals(b1, b2); - Assert.assertFalse(b1 == b2); - } -} diff --git 
a/server/src/test/java/sk/linhard/exactly/impl/ByteArrayTests.java b/server/src/test/java/sk/linhard/exactly/impl/ByteArrayTests.java deleted file mode 100644 index 89aaff3..0000000 --- a/server/src/test/java/sk/linhard/exactly/impl/ByteArrayTests.java +++ /dev/null @@ -1,142 +0,0 @@ -package sk.linhard.exactly.impl; - -import static sk.linhard.exactly.impl.TestUtil.bytes; -import static sk.linhard.exactly.impl.TestUtil.randomBytes; - -import java.util.Arrays; - -import org.junit.Assert; -import org.junit.Test; - -import sk.linhard.exactly.Search; -import sk.linhard.exactly.SearchBuilder; - -public class ByteArrayTests { - - private SearchBuilder searchBuilder = new SearchBuilder(); - private Search search; - private SearchResultChecker srChecker; - - protected void document(byte[] document) { - searchBuilder.add(Integer.toString(searchBuilder.size()), document); - } - - protected void document(int... document) { - searchBuilder.add(Integer.toString(searchBuilder.size()), TestUtil.bytes(document)); - } - - protected SearchResultChecker find(byte[] pattern) { - return (srChecker = new SearchResultChecker(search.find(pattern))); - } - - protected SearchResultChecker find(int... pattern) { - return (srChecker = new SearchResultChecker(search.find(TestUtil.bytes(pattern)))); - } - - protected void buildSearch() { - search = searchBuilder.build(); - } - - protected void buildSearch(int... separatorBytes) { - search = searchBuilder.build(separatorBytes.length == 0 ? 
null : bytes(separatorBytes)); - } - - @Test - public void testEmpty() { - document(); - buildSearch(); - find(1).assertEmpty(); - try { - find(); - Assert.fail(); - } catch (RuntimeException e) { - // ok - } - try { - find((byte[]) null); - Assert.fail(); - } catch (RuntimeException e) { - // ok - } - } - - @Test - public void testRandomMegabyte() { - byte[] randomBytes = randomBytes(1024 * 1024); // 1mb of random data - document(randomBytes); - buildSearch(); - for (int i = 0; i < 1024; i++) { - int position = i * 1024; - byte[] chunk = Arrays.copyOfRange(randomBytes, position, position + 1024); - find(chunk).assertHasGlobalPosition(position); - } - } - - @Test - public void testZeroMegabyte() { - document(new byte[1024 * 1024]); // 1mb of zeroes - buildSearch(); - SearchResultChecker sr = find(new byte[1024]); - for (int i = 0; i < 1024; i++) { - sr.assertHasGlobalPosition(i); - } - } - - @Test - public void testLinesBinary() { - assertLinesAboveBelowScenario(); - } - - @Test - public void testLinesBinaryNewLineInSeparator1() { - assertLinesAboveBelowScenario(-128, 13, 10); - } - - @Test - public void testLinesBinaryNewLineInSeparator2() { - assertLinesAboveBelowScenario(-128, 13, 10, -128); - } - - @Test - public void testLinesBinaryNewLineInSeparator3() { - assertLinesAboveBelowScenario(10, 13); - } - - private void assertLinesAboveBelowScenario(int... 
separator) { - document(0, 0, 0, 13, 1, 1, 1, 13, 2, 2, 2, 13, 3, 3, 3, 13, 4, 4, 4); - document(0, 0, 0, 13, 10, 1, 1, 1, 13, 10, 2, 2, 2, 13, 10, 3, 3, 3, 13, 10, 4, 4, 4); - - buildSearch(separator); - - find(2, 2, 2); - - assertLinesAbove(0, 8, 0); - assertLinesAbove(0, 8, 1, 1, 1, 1, 13); - assertLinesAbove(0, 8, 2, 0, 0, 0, 13, 1, 1, 1, 13); - assertLinesAbove(0, 8, 3, 0, 0, 0, 13, 1, 1, 1, 13); - - assertLinesAbove(1, 10, 0); - assertLinesAbove(1, 10, 1, 1, 1, 1, 13, 10); - assertLinesAbove(1, 10, 2, 0, 0, 0, 13, 10, 1, 1, 1, 13, 10); - assertLinesAbove(1, 10, 3, 0, 0, 0, 13, 10, 1, 1, 1, 13, 10); - - assertLinesBelow(0, 8, 0); - assertLinesBelow(0, 8, 1, 13, 3, 3, 3); - assertLinesBelow(0, 8, 2, 13, 3, 3, 3, 13, 4, 4, 4); - assertLinesBelow(0, 8, 3, 13, 3, 3, 3, 13, 4, 4, 4); - - assertLinesBelow(1, 10, 0); - assertLinesBelow(1, 10, 1, 13, 10, 3, 3, 3); - assertLinesBelow(1, 10, 2, 13, 10, 3, 3, 3, 13, 10, 4, 4, 4); - assertLinesBelow(1, 10, 3, 13, 10, 3, 3, 3, 13, 10, 4, 4, 4); - } - - private void assertLinesAbove(int doc, int pos, int maxLines, int... expectedBytes) { - srChecker.assertLinesAbove(doc, pos, maxLines, expectedBytes); - } - - private void assertLinesBelow(int doc, int pos, int maxLines, int... 
expectedBytes) { - srChecker.assertLinesBelow(doc, pos, maxLines, expectedBytes); - } - -} diff --git a/server/src/test/java/sk/linhard/exactly/impl/SASearchSeparatorTest.java b/server/src/test/java/sk/linhard/exactly/impl/SASearchSeparatorTest.java deleted file mode 100644 index a1c6fa8..0000000 --- a/server/src/test/java/sk/linhard/exactly/impl/SASearchSeparatorTest.java +++ /dev/null @@ -1,99 +0,0 @@ -package sk.linhard.exactly.impl; - -import static sk.linhard.exactly.impl.TestUtil.bytes; - -import org.junit.Assert; -import org.junit.Test; - -import sk.linhard.exactly.Search; -import sk.linhard.exactly.SearchBuilder; -import sk.linhard.exactly.SearchResult; -import sk.linhard.exactly.impl.DefaultSearch; -import sk.linhard.exactly.impl.EnhancedSuffixArray; -import sk.linhard.exactly.impl.MultiDocumentSearch; - -public class SASearchSeparatorTest { - - private DefaultSearch assertSeparator(byte[] input, int... expectedSeparatorInts) { - return assertSeparator(input, bytes(expectedSeparatorInts)); - } - - private DefaultSearch assertSeparator(byte[] input, byte[] expectedSeparator) { - MultiDocumentSearch s = MultiDocumentSearch.compute(input, new int[] { 0 }, new String[] { "0" }); - byte[] actualSeparator = s.findSeparator(); - Assert.assertArrayEquals(expectedSeparator, actualSeparator); - return s; - } - - @Test - public void testFindSeparator1() { - assertSeparator(TestUtil.bytes("a"), -128); - assertSeparator(new byte[] { -127 }, -128); - assertSeparator(new byte[] { -128 }, -127); - assertSeparator(new byte[] { -128, -127 }, -126); - assertSeparator(new byte[] { -128, -126 }, -127); - } - - @Test - public void testFindSeparator2() { - byte[] data = new byte[256 * 256 * 2]; - for (int i = 0; i < 256 * 256; i++) { - data[2 * i] = (byte) (i >> 8); - data[2 * i + 1] = (byte) (i); - } - MultiDocumentSearch s = MultiDocumentSearch.compute(data, new int[] { 0 }, new String[] { "0" }); - byte[] sep = s.findSeparator(); - Assert.assertEquals(0, s.find(sep).size()); 
- } - - @Test - public void testFindSeparator3() { - byte[] data = new byte[256]; - for (int i = 0; i < 256; i++) { - data[i] = (byte) i; - } - MultiDocumentSearch s = MultiDocumentSearch.compute(data, new int[] { 0 }, new String[] { "0" }); - byte[] sep = s.findSeparator(); - Assert.assertEquals(0, s.find(sep).size()); - } - - @Test - public void testFindSeparator4() { - MultiDocumentSearch s = MultiDocumentSearch.compute(TestUtil.randomBytes(1024), new int[] { 0 }, - new String[] { "0" }); - byte[] sep = s.findSeparator(); - Assert.assertEquals(0, s.find(sep).size()); - } - - // @Test - public void testMultiRandom() { - SearchBuilder builder = new SearchBuilder(); - byte[][] random = new byte[1024][]; - for (int i = 0; i < 1024; i++) { - random[i] = TestUtil.randomBytes(1024); - builder.add(Integer.toString(i), random[i]); - } - Search search = builder.build(); - for (int i = 0; i < random.length; i++) { - SearchResult r = search.find(random[i]); - Assert.assertTrue(r.hasPosition(i, 0)); - } - - } - - @Test - public void introduceSeparators() throws Exception { - EnhancedSuffixArray esa = new EnhancedSuffixArray(TestUtil.bytes("aabbcc")); - esa.computeLCP(true); - esa.computeUpDown(); - esa.computeNext(); - int[] offset = { 0, 2, 4 }; - esa.introduceSeparators(// - offset, // - TestUtil.bytes("dd")); - Assert.assertEquals("aaddbbddcc", new String(esa.data, "UTF-8")); - Assert.assertEquals(0, offset[0]); - Assert.assertEquals(4, offset[1]); - Assert.assertEquals(8, offset[2]); - } -} diff --git a/server/src/test/java/sk/linhard/exactly/impl/SearchResultChecker.java b/server/src/test/java/sk/linhard/exactly/impl/SearchResultChecker.java deleted file mode 100644 index 81ca188..0000000 --- a/server/src/test/java/sk/linhard/exactly/impl/SearchResultChecker.java +++ /dev/null @@ -1,61 +0,0 @@ -package sk.linhard.exactly.impl; - -import java.util.Arrays; -import java.util.HashSet; -import java.util.Set; -import java.util.stream.Collectors; - -import 
org.apache.commons.lang3.ArrayUtils; -import org.junit.Assert; - -import sk.linhard.exactly.Hit; -import sk.linhard.exactly.SearchResult; - -public class SearchResultChecker { - private SearchResult searchResult; - - public SearchResultChecker(SearchResult searchResult) { - this.searchResult = searchResult; - } - - public SearchResult result() { - return searchResult; - } - - public void assertEmpty() { - Assert.assertEquals(0, searchResult.size()); - } - - public void assertHasGlobalPosition(int position) { - Assert.assertTrue("Position " + position + " not found", searchResult.hasGlobalPosition(position)); - } - - private T require(T object, String message) { - Assert.assertNotNull(message, object); - return object; - } - - // TODO: rename before after - public void assertLinesAbove(int doc, int pos, int maxLines, int... expectedBytes) { - Hit hit = require(searchResult.hitWithPosition(doc, pos), - "Hit doc=" + doc + ", pos=" + pos + " not found"); - Assert.assertArrayEquals(TestUtil.bytes(expectedBytes), hit.lineContext(maxLines, 0).before()); - } - - public void assertLinesBelow(int doc, int pos, int maxLines, int... expectedBytes) { - Hit hit = require(searchResult.hitWithPosition(doc, pos), - "Hit doc=" + doc + ", pos=" + pos + " not found"); - Assert.assertArrayEquals(TestUtil.bytes(expectedBytes), hit.lineContext(0, maxLines).after()); - } - - public void assertGlobalPositions(int... 
position) { - Set expectedPositions = new HashSet<>(Arrays.asList(ArrayUtils.toObject(position))); - Set actualPositions = globalPositionSet(); - Assert.assertEquals(expectedPositions, actualPositions); - } - - public Set globalPositionSet() { - return searchResult.hits().stream().map(h -> h.globalPosition()).collect(Collectors.toSet()); - } - -} diff --git a/server/src/test/java/sk/linhard/exactly/impl/StringSearchResultChecker.java b/server/src/test/java/sk/linhard/exactly/impl/StringSearchResultChecker.java deleted file mode 100644 index cbd5d20..0000000 --- a/server/src/test/java/sk/linhard/exactly/impl/StringSearchResultChecker.java +++ /dev/null @@ -1,55 +0,0 @@ -package sk.linhard.exactly.impl; - -import java.util.Arrays; -import java.util.HashSet; -import java.util.Set; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.ArrayUtils; -import org.junit.Assert; - -import sk.linhard.exactly.SearchResult; - -public class StringSearchResultChecker { - - private SearchResult searchResult; - - public StringSearchResultChecker(SearchResult searchResult) { - this.searchResult = searchResult; - } - - private Set set(int... position) { - return new HashSet<>(Arrays.asList(ArrayUtils.toObject(position))); - } - - public void assertGlobalPositions(int... expectedGlobalPositions) { - Assert.assertEquals(set(expectedGlobalPositions), globalPositionSet()); - } - - public void assertPositions(int... 
expectedPositions) { - Assert.assertEquals(set(expectedPositions), positionSet()); - } - - public Set globalPositionSet() { - return searchResult.hits().stream().map(h -> h.globalPosition()).collect(Collectors.toSet()); - } - - public Set positionSet() { - return searchResult.hits().stream().map(h -> h.position()).collect(Collectors.toSet()); - } - - public void assertSize(int expectedSize) { - Assert.assertEquals(expectedSize, searchResult.size()); - } - - public void assertLinesBefore(int maxLines, String expectedLinesAbove) { - assertSize(1); - Assert.assertEquals(expectedLinesAbove, searchResult.hit(0).lineContext(maxLines, 0).before()); - } - - public void assertLinesAfter(int maxLines, String expectedLinesBelow) { - assertSize(1); - Assert.assertEquals(expectedLinesBelow, searchResult.hit(0).lineContext(0, maxLines).after()); - } - -} diff --git a/server/src/test/java/sk/linhard/exactly/impl/StringTests.java b/server/src/test/java/sk/linhard/exactly/impl/StringTests.java deleted file mode 100644 index 8eff683..0000000 --- a/server/src/test/java/sk/linhard/exactly/impl/StringTests.java +++ /dev/null @@ -1,278 +0,0 @@ -package sk.linhard.exactly.impl; - -import org.junit.Assert; -import org.junit.Test; - -import sk.linhard.exactly.Hit; -import sk.linhard.exactly.HitContext; -import sk.linhard.exactly.Search; -import sk.linhard.exactly.SearchResult; -import sk.linhard.exactly.StringSearchBuilder; - -public class StringTests { - - private StringSearchBuilder searchBuilder = new StringSearchBuilder(); - private Search search; - private StringSearchResultChecker srChecker; - - protected void document(String document) { - searchBuilder.add(Integer.toString(searchBuilder.size()), document); - } - - protected void buildSearch() { - search = searchBuilder.build(); - } - - protected StringSearchResultChecker find(String pattern) { - return (srChecker = new StringSearchResultChecker(search.find(pattern))); - } - - protected void assertPositions(String pattern, int... 
position) { - find(pattern); - srChecker.assertPositions(position); - } - - protected Hit assertSingleHit(String pattern, int document, int position) { - SearchResult r = search.find(pattern); - Assert.assertEquals(1, r.size()); - Hit hit = r.hit(0); - Assert.assertEquals(document, hit.document().index()); - Assert.assertEquals(position, hit.position()); - return hit; - } - - protected void assertSingleHit(String pattern, int document, int position, int maxCtx, String leftCtx, - String rightCtx) { - Hit hit = assertSingleHit(pattern, document, position); - HitContext ctx = hit.charContext(maxCtx, maxCtx); - Assert.assertEquals(leftCtx, ctx.before()); - Assert.assertEquals(rightCtx, ctx.after()); - } - - @Test - public void testAbracadabra() { - document("abracadabra"); - buildSearch(); - assertPositions("abracadabra", 0); - assertPositions("bracadabra", 1); - assertPositions("racadabra", 2); - assertPositions("acadabra", 3); - assertPositions("cadabra", 4); - assertPositions("adabra", 5); - assertPositions("dabra", 6); - assertPositions("abra", 7, 0); - assertPositions("bra", 8, 1); - assertPositions("ra", 9, 2); - assertPositions("a", 10, 7, 0, 3, 5); - assertPositions("b", 8, 1); - assertPositions("c", 4); - assertPositions("d", 6); - assertPositions("r", 9, 2); - } - - @Test - public void testAcaaacatat() { - document("acaaacatat"); - buildSearch(); - assertPositions("acaaacatat", 0); - assertPositions("caaacatat", 1); - assertPositions("aaacatat", 2); - assertPositions("aacatat", 3); - assertPositions("acatat", 4); - assertPositions("catat", 5); - assertPositions("atat", 6); - assertPositions("tat", 7); - assertPositions("at", 8, 6); - assertPositions("t", 9, 7); - - assertPositions("acaaacatat", 0); - assertPositions("acaaacata", 0); - assertPositions("acaaacat", 0); - assertPositions("acaaaca", 0); - assertPositions("acaaac", 0); - assertPositions("acaaa", 0); - assertPositions("acaa", 0); - assertPositions("aca", 0, 4); - assertPositions("ac", 0, 4); - 
assertPositions("a", 2, 3, 0, 4, 8, 6); - - assertPositions("caaacatat", 1); - assertPositions("caaacata", 1); - assertPositions("caaacat", 1); - assertPositions("caaaca", 1); - assertPositions("caaac", 1); - assertPositions("caaa", 1); - assertPositions("caa", 1); - assertPositions("ca", 1, 5); - assertPositions("c", 1, 5); - - assertPositions("aaacatat", 2); - assertPositions("aaacata", 2); - assertPositions("aaacat", 2); - assertPositions("aaaca", 2); - assertPositions("aaac", 2); - assertPositions("aaa", 2); - assertPositions("aa", 2, 3); - - assertPositions("aacatat", 3); - assertPositions("aacata", 3); - assertPositions("aacat", 3); - assertPositions("aaca", 3); - assertPositions("aac", 3); - - assertPositions("acatat", 4); - assertPositions("acata", 4); - assertPositions("acat", 4); - - assertPositions("catat", 5); - assertPositions("cata", 5); - assertPositions("cat", 5); - - assertPositions("atat", 6); - assertPositions("ata", 6); - } - - @Test - public void testMississippi() { - document("mississippi"); - buildSearch(); - assertPositions("mississippi", 0); - assertPositions("ississippi", 1); - assertPositions("ssissippi", 2); - assertPositions("sissippi", 3); - assertPositions("issippi", 4); - assertPositions("ssippi", 5); - assertPositions("sippi", 6); - assertPositions("ippi", 7); - assertPositions("ppi", 8); - assertPositions("pi", 9); - assertPositions("i", 10, 7, 4, 1); - - assertPositions("mississippi", 0); - assertPositions("mississipp", 0); - assertPositions("mississip", 0); - assertPositions("mississi", 0); - assertPositions("mississ", 0); - assertPositions("missis", 0); - assertPositions("missi", 0); - assertPositions("miss", 0); - assertPositions("mis", 0); - assertPositions("mi", 0); - assertPositions("m", 0); - - assertPositions("ississippi", 1); - assertPositions("ississipp", 1); - assertPositions("ississip", 1); - assertPositions("ississi", 1); - assertPositions("ississ", 1); - assertPositions("issis", 1); - assertPositions("issi", 1, 4); - 
assertPositions("iss", 1, 4); - assertPositions("is", 1, 4); - - assertPositions("ssissippi", 2); - assertPositions("ssissipp", 2); - assertPositions("ssissip", 2); - assertPositions("ssissi", 2); - assertPositions("ssiss", 2); - assertPositions("ssis", 2); - assertPositions("ssi", 2, 5); - assertPositions("ss", 2, 5); - assertPositions("s", 2, 3, 5, 6); - - assertPositions("sissippi", 3); - assertPositions("sissipp", 3); - assertPositions("sissip", 3); - assertPositions("sissi", 3); - assertPositions("siss", 3); - assertPositions("sis", 3); - assertPositions("si", 3, 6); - - assertPositions("issippi", 4); - assertPositions("issipp", 4); - assertPositions("issip", 4); - - assertPositions("ssippi", 5); - assertPositions("ssipp", 5); - assertPositions("ssip", 5); - - assertPositions("sippi", 6); - assertPositions("sipp", 6); - assertPositions("sip", 6); - } - - @Test - public void testJoin() { - document("abcde"); - document("fghij"); - document("klmno"); - document("pqrst"); - buildSearch(); - assertPositions("defg"); - assertSingleHit("abc", 0, 0); - assertSingleHit("fgh", 1, 0); - assertSingleHit("klm", 2, 0); - assertSingleHit("pqr", 3, 0); - - assertSingleHit("bcd", 0, 1, 2, "a", "e"); - assertSingleHit("ghi", 1, 1, 1, "f", "j"); - assertSingleHit("lmn", 2, 1, 10, "k", "o"); - assertSingleHit("qrs", 3, 1, 100, "p", "t"); - - assertSingleHit("abcde", 0, 0); - assertSingleHit("fghij", 1, 0); - assertSingleHit("klmno", 2, 0); - assertSingleHit("pqrst", 3, 0); - - assertPositions("abcde", 0); - assertPositions("fghij", 0); - assertPositions("klmno", 0); - assertPositions("pqrst", 0); - } - - protected void assertLinesAbove(int maxLines, String expectedLinesAbove) { - srChecker.assertLinesBefore(maxLines, expectedLinesAbove); - } - - protected void assertLinesBelow(int maxLines, String expectedLinesBelow) { - srChecker.assertLinesAfter(maxLines, expectedLinesBelow); - } - - @Test - public void testLineContext() { - document("aaa\nbbb\nccc\nddd\neee"); - 
buildSearch(); - find("ccc"); - assertLinesAbove(0, ""); - assertLinesAbove(1, "bbb\n"); - assertLinesAbove(2, "aaa\nbbb\n"); - assertLinesAbove(3, "aaa\nbbb\n"); - assertLinesBelow(0, ""); - assertLinesBelow(1, "\nddd"); - assertLinesBelow(2, "\nddd\neee"); - assertLinesBelow(3, "\nddd\neee"); - } - - @Test - public void testLineContext2() { - document("aaa\nbbb\nccGGcc\nddd\neee"); - buildSearch(); - find("GG"); - assertLinesAbove(0, "cc"); - assertLinesAbove(1, "bbb\ncc"); - assertLinesAbove(2, "aaa\nbbb\ncc"); - assertLinesAbove(3, "aaa\nbbb\ncc"); - assertLinesBelow(0, "cc"); - assertLinesBelow(1, "cc\nddd"); - assertLinesBelow(2, "cc\nddd\neee"); - assertLinesBelow(3, "cc\nddd\neee"); - } - - @Test - public void testAaaa() { - document("aaaaaaaaaaaaaaaaaaaa"); - buildSearch(); - assertPositions("aaaa", 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - } - -} diff --git a/server/src/test/java/sk/linhard/exactly/impl/TestUtil.java b/server/src/test/java/sk/linhard/exactly/impl/TestUtil.java deleted file mode 100644 index 8d4bc6e..0000000 --- a/server/src/test/java/sk/linhard/exactly/impl/TestUtil.java +++ /dev/null @@ -1,47 +0,0 @@ -package sk.linhard.exactly.impl; - -import java.io.UnsupportedEncodingException; -import java.util.Arrays; -import java.util.Random; - -import org.apache.commons.lang3.ArrayUtils; -import org.junit.Assert; - -public class TestUtil { - - public static byte[] join(byte[] a, byte[] b) { - byte[] c = new byte[a.length + b.length]; - System.arraycopy(a, 0, c, 0, a.length); - System.arraycopy(b, 0, c, a.length, b.length); - return c; - } - - public static byte[] bytes(int... 
a) { - byte[] b = new byte[a.length]; - for (int i = 0; i < b.length; i++) { - b[i] = (byte) a[i]; - } - return b; - } - - public static byte[] bytes(String s) { - try { - byte[] bytes = s.getBytes("UTF-8"); - Assert.assertEquals(s.length(), bytes.length); - return bytes; - } catch (UnsupportedEncodingException e) { - throw new RuntimeException(e); - } - } - - public static byte[] randomBytes(int n) { - Random r = new Random(); - byte[] rb = new byte[n]; - r.nextBytes(rb); - return rb; - } - - public static String print(byte[] b) { - return Arrays.asList(ArrayUtils.toObject(b)).toString(); - } -}