-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit f5faa16
Showing
59 changed files
with
11,353 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
DATABASE_URL="postgres://db_user:db_pass@localhost:5432/record_database" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# CI workflow: builds and tests the crate, then uploads coverage.
name: Rust

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

env:
  CARGO_TERM_COLOR: always

jobs:
  build:
    name: Build
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Compose v2 is invoked as `docker compose`; the standalone v1
      # `docker-compose` binary is not available on current hosted runners.
      - name: Setup test database
        run: |
          cd docker && docker compose up -d && cd ..
          docker ps -a
      - name: Build
        run: cargo build --verbose

      - name: Run tests
        run: cargo test --verbose

      - name: Run doc tests
        run: cargo test --doc --verbose

  test:
    name: Coverage
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup test database
        run: |
          cd docker && docker compose up -d && cd ..
          docker ps -a
      - name: Setup
        run: |
          cargo install cargo-tarpaulin
      - name: Install Nightly
        run: |
          rustup toolchain install nightly
      - name: Generate code coverage
        run: |
          cargo tarpaulin --verbose --all-features --workspace --timeout 120 --run-types Tests --run-types Doctests --out Xml
      # codecov-action v1 has been retired upstream; v4 keeps the same
      # `token` / `fail_ci_if_error` inputs.
      - name: Coverage upload
        uses: codecov/codecov-action@v4
        with:
          token: ${{secrets.CODECOV_TOKEN}}
          fail_ci_if_error: true
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
/target | ||
Cargo.lock | ||
.idea/ | ||
cobertura.xml | ||
ui/dist | ||
ui/node_modules |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
[package] | ||
name = "warcse" | ||
version = "0.1.0" | ||
authors = ["Stavros Grigoriou <[email protected]>"] | ||
edition = "2021" | ||
|
||
[lib] | ||
path = "src/lib.rs" | ||
|
||
[[bin]] | ||
path = "src/main.rs" | ||
name = "warcse" | ||
|
||
[dependencies] | ||
actix-web = "4.0.0" | ||
tokio = { version = "1", features = ["macros", "rt-multi-thread"] } | ||
actix-files = "0.6.2" | ||
actix-cors = "0.6.5" | ||
config = { version = "0.10.1", default-features = false, features = ["yaml"] } | ||
sqlx = { version = "0.7.0", default-features = false, features = ["runtime-tokio-rustls", "macros", "postgres", "uuid", "chrono", "migrate"] } | ||
serde = { version = "1.0.115", features = ["derive"] } | ||
log = "0.4.10" | ||
simple_logger = "4.3.3" | ||
serde_json = { version = "1.0.0" } | ||
json = "0.12" | ||
convert_case = "0.6.0" | ||
actix-web-lab = "0.20.1" | ||
|
||
[dev-dependencies] | ||
reqwest = { version = "0.11.23", features = ["json"] } | ||
tokio = "1.0.1" | ||
actix-rt = "2.0.0" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2021 Stavros Grigoriou | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
### warc-search-engine | ||
|
||
![Build](https://github.com/stav121/warc-se/workflows/Rust/badge.svg?branch=project-init) | ||
[![codecov](https://codecov.io/gh/stav121/warc-se/branch/main/graph/badge.svg?token=ld4QX2stjM)](https://codecov.io/gh/stav121/warc-se) | ||
![Last commit](https://img.shields.io/github/last-commit/stav121/warc-se/main) | ||
![Licence](https://img.shields.io/github/license/stav121/warc-se) | ||
|
||
A simple Search Engine for WARC/0.18 parsed files, written in Rust | ||
|
||
---- | ||
### TODO | ||
|
||
* Cleanup | ||
* Fix hardcoded paths | ||
* Upgrade code |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Builds the UI with `ng build`, then the Rust crate with `cargo build`.
#
# -e: stop at the first failing command, so a failed `cd ui/` or UI build
#     cannot fall through to `cd ../` + `cargo build` in the wrong directory.
# -x: echo each command before it runs.
# (The previous `set -xo` passed `-o` without an option name, which only
#  prints the current option table and never enabled errexit.)
set -ex

cd ui/
ng build --prod --deploy-url /static/ --output-path dist/
cd ../
cargo build
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
application_port: 8000 | ||
database: | ||
host: "localhost" | ||
port: 5432 | ||
username: "db_user" | ||
password: "db_pass" | ||
database_name: "record_database" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
POSTGRES_USER=db_user | ||
POSTGRES_PASSWORD=db_pass | ||
POSTGRES_DB=record_database |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
version: '3' | ||
|
||
services: | ||
postgres: | ||
image: "postgres" | ||
env_file: | ||
- database.env | ||
ports: | ||
- "5432:5432" | ||
volumes: | ||
- ./init.sql:/docker-entrypoint-initdb.d/1-init.sql | ||
|
||
volumes: | ||
database-data: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
-----------------------
-- Table: CORPUS_INFO
-- One row per corpus: a unique name plus a record counter.
-----------------------
CREATE TABLE corpus_info
(
    id            SERIAL PRIMARY KEY,
    name          TEXT NOT NULL UNIQUE,
    total_records INTEGER DEFAULT 0
);

COMMENT ON TABLE corpus_info IS 'The general corpus metadata.';
COMMENT ON COLUMN corpus_info.id IS 'The corpus id (auto-generated).';
COMMENT ON COLUMN corpus_info.name IS 'The corpus name (unique).';
COMMENT ON COLUMN corpus_info.total_records IS 'The corpus total record count.';
|
||
-----------------------
-- Table: CORPUS_FILES
-- Files registered under a corpus; each file name is unique.
-----------------------
CREATE TABLE corpus_files
(
    id        SERIAL PRIMARY KEY,
    name      TEXT    NOT NULL UNIQUE,
    corpus_id INTEGER NOT NULL REFERENCES corpus_info (id)
);

COMMENT ON TABLE corpus_files IS 'The files contained in each corpus.';
COMMENT ON COLUMN corpus_files.id IS 'The file id.';
COMMENT ON COLUMN corpus_files.name IS 'The file name (unique).';
COMMENT ON COLUMN corpus_files.corpus_id IS 'The corpus the file belongs to.';
|
||
-----------------------
-- Table: RECORD_INDEX
-- One row per parsed record, linked to the corpus it came from.
-----------------------
CREATE TABLE record_index
(
    id          SERIAL PRIMARY KEY,
    trec_id     TEXT    NOT NULL,
    uri         TEXT    NOT NULL,
    version     TEXT    NOT NULL,
    analyzed    BOOLEAN NOT NULL,
    total_words INTEGER DEFAULT 0,
    corpus_id   INTEGER NOT NULL REFERENCES corpus_info (id)
);

COMMENT ON TABLE record_index IS 'Record database.';
COMMENT ON COLUMN record_index.id IS 'A auto generated entry id.';
COMMENT ON COLUMN record_index.trec_id IS 'The TREC id of the document.';
COMMENT ON COLUMN record_index.uri IS 'The URL linked to the record.';
COMMENT ON COLUMN record_index.version IS 'The WARC version from the parsed document';
COMMENT ON COLUMN record_index.analyzed IS 'The record has been analyzed';
COMMENT ON COLUMN record_index.total_words IS 'The total words in the record (including meta).';
COMMENT ON COLUMN record_index.corpus_id IS 'The corpus this record belongs to.';
|
||
----------------------
-- Table: RECORD_META
-- Free-text metadata rows attached to a record (no key of its own).
----------------------
CREATE TABLE record_meta
(
    record INTEGER NOT NULL REFERENCES record_index (id),
    meta   TEXT    NOT NULL
);

COMMENT ON TABLE record_meta IS 'The metadata linked to a record.';
COMMENT ON COLUMN record_meta.record IS 'Foreign key to the linked record';
COMMENT ON COLUMN record_meta.meta IS 'The metadata';
|
||
----------------------
-- Table: WORD_INDEX
-- Global dictionary of words, keyed by the word itself.
----------------------
CREATE TABLE word_index
(
    -- PRIMARY KEY already implies NOT NULL and a unique index; a separate
    -- UNIQUE constraint would only create a second, redundant index.
    word              TEXT PRIMARY KEY,
    total_appearances INT              NOT NULL DEFAULT 0,
    frequency         DOUBLE PRECISION NOT NULL DEFAULT 0.0
);

COMMENT ON TABLE word_index IS 'Index of all found words.';
COMMENT ON COLUMN word_index.word IS 'The parsed word.';
COMMENT ON COLUMN word_index.total_appearances IS 'The total appearances of the word in all documents.';
COMMENT ON COLUMN word_index.frequency IS 'The frequency of the word in all documents.';
|
||
----------------------------
-- Table: WORD_RECORD_INDEX
-- Per-record statistics for each word (word x record).
----------------------------
CREATE TABLE word_record_index
(
    word        TEXT             NOT NULL REFERENCES word_index (word),
    record      INT              NOT NULL REFERENCES record_index (id),
    appearances INT              NOT NULL DEFAULT 0,
    tf          DOUBLE PRECISION NOT NULL DEFAULT 0.0,
    idf         DOUBLE PRECISION NOT NULL DEFAULT 0.0
);

COMMENT ON TABLE word_record_index IS 'The words linked to a record.';
COMMENT ON COLUMN word_record_index.word IS 'The word text.';
COMMENT ON COLUMN word_record_index.record IS 'The owner record.';
COMMENT ON COLUMN word_record_index.appearances IS 'Appearances of the word in the record.';
-- tf / idf were the only undocumented columns of this table.
COMMENT ON COLUMN word_record_index.tf IS 'Term frequency of the word in the record.';
COMMENT ON COLUMN word_record_index.idf IS 'IDF value stored for this word/record pair.';
|
||
----------------------------
-- Table: WORD_CORPUS_INDEX
-- Per-corpus statistics for each word (word x corpus).
----------------------------
CREATE TABLE word_corpus_index
(
    word        TEXT             NOT NULL REFERENCES word_index (word),
    corpus      INT              NOT NULL REFERENCES corpus_info (id),
    appearances INT              NOT NULL DEFAULT 0,
    tf          DOUBLE PRECISION NOT NULL DEFAULT 0.0,
    idf         DOUBLE PRECISION NOT NULL DEFAULT 0.0
);

COMMENT ON TABLE word_corpus_index IS 'The words linked to a corpus.';
-- The word column was the only undocumented column of this table.
COMMENT ON COLUMN word_corpus_index.word IS 'The word text.';
COMMENT ON COLUMN word_corpus_index.corpus IS 'The linked corpus.';
COMMENT ON COLUMN word_corpus_index.appearances IS 'Total appearances of the word in the corpus.';
COMMENT ON COLUMN word_corpus_index.tf IS 'Term frequency in the corpus.';
COMMENT ON COLUMN word_corpus_index.idf IS 'IDF of the word in the corpus.';
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
/// Structure that contains the Settings information. | ||
#[derive(serde::Deserialize)] | ||
pub struct Settings { | ||
pub database: DatabaseSettings, | ||
pub application_port: u16, | ||
} | ||
|
||
/// Structure that contains the database information. | ||
#[derive(serde::Deserialize)] | ||
pub struct DatabaseSettings { | ||
pub username: String, | ||
pub password: String, | ||
pub port: u16, | ||
pub host: String, | ||
pub database_name: String, | ||
} | ||
|
||
/// Database settings initializer. | ||
impl DatabaseSettings { | ||
/// Returns a full connection string with the database suffix. | ||
pub fn connection_string(&self) -> String { | ||
format!( | ||
"postgres://{}:{}@{}:{}/{}", | ||
self.username, self.password, self.host, self.port, self.database_name | ||
) | ||
} | ||
|
||
/// Returns a connection string without the database suffix. | ||
pub fn connection_string_without_db(&self) -> String { | ||
format!( | ||
"postgres://{}:{}@{}:{}", | ||
self.username, self.password, self.host, self.port | ||
) | ||
} | ||
} | ||
|
||
/// Fetch the configuration of the application. | ||
pub fn get_configuration() -> Result<Settings, config::ConfigError> { | ||
let mut settings = config::Config::default(); | ||
|
||
settings.merge(config::File::with_name("configuration"))?; | ||
|
||
settings.try_into() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
/// This mod contains the structs used for returning results. | ||
mod response; | ||
|
||
pub use response::*; |
Oops, something went wrong.