Skip to content

Commit

Permalink
feat(core): project init
Browse files Browse the repository at this point in the history
  • Loading branch information
stav121 committed Jan 8, 2024
0 parents commit f5faa16
Show file tree
Hide file tree
Showing 59 changed files with 11,353 additions and 0 deletions.
1 change: 1 addition & 0 deletions .env
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
DATABASE_URL="postgres://db_user:db_pass@localhost:5432/record_database"
60 changes: 60 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# CI for the crate: one job builds and tests, a second job measures coverage
# with cargo-tarpaulin and uploads the report to Codecov.
name: Rust

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

env:
  CARGO_TERM_COLOR: always

jobs:
  build:
    name: Build
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # The Compose v2 plugin (`docker compose`) replaces the standalone
      # `docker-compose` v1 binary, which hosted runners no longer ship.
      - name: Setup test database
        run: |
          cd docker && docker compose up -d && cd ..
          docker ps -a

      - name: Build
        run: cargo build --verbose

      - name: Run tests
        run: cargo test --verbose

      - name: Run doc tests
        run: cargo test --doc --verbose

  test:
    name: Coverage
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup test database
        run: |
          cd docker && docker compose up -d && cd ..
          docker ps -a

      - name: Setup
        run: |
          cargo install cargo-tarpaulin

      - name: Install Nightly
        run: |
          rustup toolchain install nightly

      - name: Generate code coverage
        run: |
          cargo tarpaulin --verbose --all-features --workspace --timeout 120 --run-types Tests --run-types Doctests --out Xml

      # codecov-action v1 relied on the retired bash uploader.
      - name: Coverage upload
        uses: codecov/codecov-action@v4
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          fail_ci_if_error: true
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
/target
Cargo.lock
.idea/
cobertura.xml
ui/dist
ui/node_modules
32 changes: 32 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[package]
name = "warcse"
version = "0.1.0"
authors = ["Stavros Grigoriou <[email protected]>"]
edition = "2021"

[lib]
path = "src/lib.rs"

[[bin]]
path = "src/main.rs"
name = "warcse"

[dependencies]
actix-web = "4.0.0"
tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
actix-files = "0.6.2"
actix-cors = "0.6.5"
config = { version = "0.10.1", default-features = false, features = ["yaml"] }
sqlx = { version = "0.7.0", default-features = false, features = ["runtime-tokio-rustls", "macros", "postgres", "uuid", "chrono", "migrate"] }
serde = { version = "1.0.115", features = ["derive"] }
log = "0.4.10"
simple_logger = "4.3.3"
serde_json = { version = "1.0.0" }
json = "0.12"
convert_case = "0.6.0"
actix-web-lab = "0.20.1"

[dev-dependencies]
reqwest = { version = "0.11.23", features = ["json"] }
tokio = "1.0.1"
actix-rt = "2.0.0"
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2021 Stavros Grigoriou

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
### warc-search-engine

![Build](https://github.com/stav121/warc-se/workflows/Rust/badge.svg?branch=project-init)
[![codecov](https://codecov.io/gh/stav121/warc-se/branch/main/graph/badge.svg?token=ld4QX2stjM)](https://codecov.io/gh/stav121/warc-se)
![Last commit](https://img.shields.io/github/last-commit/stav121/warc-se/main)
![Licence](https://img.shields.io/github/license/stav121/warc-se)

A simple search engine for parsed WARC/0.18 files, written in Rust.

----
### TODO

* Cleanup
* Fix hardcoded paths
* Upgrade code
6 changes: 6 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/sh
# Builds the front-end bundle with the Angular CLI, then compiles the Rust crate.
#
#   -e  stop at the first failing command, so `cargo build` never runs after a
#       failed `cd` or a failed UI build
#   -u  treat references to unset variables as errors
#   -x  trace each command as it is executed
#
# The previous `set -xo` passed `-o` without an option name, which only prints
# the current option settings and enables no error handling.
set -eux

# Front-end: output goes to ui/dist/, with asset URLs prefixed by /static/.
cd ui/
ng build --prod --deploy-url /static/ --output-path dist/
cd ../

# Back-end.
cargo build
7 changes: 7 additions & 0 deletions configuration.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
application_port: 8000
database:
host: "localhost"
port: 5432
username: "db_user"
password: "db_pass"
database_name: "record_database"
3 changes: 3 additions & 0 deletions docker/database.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
POSTGRES_USER=db_user
POSTGRES_PASSWORD=db_pass
POSTGRES_DB=record_database
14 changes: 14 additions & 0 deletions docker/docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
version: '3'

services:
postgres:
image: "postgres"
env_file:
- database.env
ports:
- "5432:5432"
volumes:
- ./init.sql:/docker-entrypoint-initdb.d/1-init.sql

volumes:
database-data:
119 changes: 119 additions & 0 deletions docker/init.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
-----------------------
-- Table: CORPUS_INFO
-----------------------
CREATE TABLE corpus_info
(
    -- Surrogate key; declared inline, equivalent to a separate PRIMARY KEY (id) clause.
    id            SERIAL PRIMARY KEY,
    name          TEXT NOT NULL UNIQUE,
    total_records INT DEFAULT 0
);

-- Catalog descriptions for the table and each of its columns.
COMMENT ON TABLE corpus_info IS 'The general corpus metadata.';
COMMENT ON COLUMN corpus_info.id IS 'The corpus id (auto-generated).';
COMMENT ON COLUMN corpus_info.name IS 'The corpus name (unique).';
COMMENT ON COLUMN corpus_info.total_records IS 'The corpus total record count.';

-----------------------
-- Table: CORPUS_FILES
-----------------------
CREATE TABLE corpus_files
(
    -- Surrogate key; declared inline, equivalent to a separate PRIMARY KEY (id) clause.
    id        SERIAL PRIMARY KEY,
    name      TEXT NOT NULL UNIQUE,
    -- Every file row must point at an existing corpus_info row.
    corpus_id INT  NOT NULL REFERENCES corpus_info (id)
);

-- Catalog descriptions for the table and each of its columns.
COMMENT ON TABLE corpus_files IS 'The files contained in each corpus.';
COMMENT ON COLUMN corpus_files.id IS 'The file id.';
COMMENT ON COLUMN corpus_files.name IS 'The file name (unique).';
COMMENT ON COLUMN corpus_files.corpus_id IS 'The corpus the file belongs to.';

-----------------------
-- Table: RECORD_INDEX
-----------------------
CREATE TABLE record_index
(
    -- Surrogate key; declared inline, equivalent to a separate PRIMARY KEY (id) clause.
    id          SERIAL PRIMARY KEY,
    trec_id     TEXT    NOT NULL,
    uri         TEXT    NOT NULL,
    version     TEXT    NOT NULL,
    analyzed    BOOLEAN NOT NULL,
    total_words INT DEFAULT 0,
    -- Every record row must point at an existing corpus_info row.
    corpus_id   INT     NOT NULL REFERENCES corpus_info (id)
);

-- Catalog descriptions for the table and each of its columns.
COMMENT ON TABLE record_index IS 'Record database.';
COMMENT ON COLUMN record_index.id IS 'An auto-generated entry id.';
COMMENT ON COLUMN record_index.trec_id IS 'The TREC id of the document.';
COMMENT ON COLUMN record_index.uri IS 'The URL linked to the record.';
COMMENT ON COLUMN record_index.version IS 'The WARC version from the parsed document';
COMMENT ON COLUMN record_index.analyzed IS 'The record has been analyzed';
COMMENT ON COLUMN record_index.total_words IS 'The total words in the record (including meta).';
COMMENT ON COLUMN record_index.corpus_id IS 'The corpus this record belongs to.';

----------------------
-- Table: RECORD_META
----------------------
CREATE TABLE record_meta
(
    record INT  NOT NULL,
    meta   TEXT NOT NULL,
    -- Same constraint as a column-level REFERENCES clause, written as a table constraint.
    FOREIGN KEY (record) REFERENCES record_index (id)
);

-- Catalog descriptions for the table and each of its columns.
COMMENT ON TABLE record_meta IS 'The metadata linked to a record.';
COMMENT ON COLUMN record_meta.record IS 'Foreign key to the linked record';
COMMENT ON COLUMN record_meta.meta IS 'The metadata';

----------------------
-- Table: WORD_INDEX
----------------------
CREATE TABLE word_index
(
    -- PRIMARY KEY already implies NOT NULL and uniqueness. The extra UNIQUE
    -- constraint only created a second, identical index that every insert
    -- had to maintain, so it is dropped here.
    word              TEXT PRIMARY KEY,
    total_appearances INT              NOT NULL DEFAULT 0,
    frequency         DOUBLE PRECISION NOT NULL DEFAULT 0.0
);

-- Catalog descriptions for the table and each of its columns.
COMMENT ON TABLE word_index IS 'Index of all found words.';
COMMENT ON COLUMN word_index.word IS 'The parsed word.';
COMMENT ON COLUMN word_index.total_appearances IS 'The total appearances of the word in all documents.';
COMMENT ON COLUMN word_index.frequency IS 'The frequency of the word in all documents.';

----------------------------
-- Table: WORD_RECORD_INDEX
----------------------------
CREATE TABLE word_record_index
(
    -- Link row between a word_index entry and a record_index entry.
    word        TEXT             NOT NULL REFERENCES word_index (word),
    record      INT              NOT NULL REFERENCES record_index (id),
    appearances INT              NOT NULL DEFAULT 0,
    tf          DOUBLE PRECISION NOT NULL DEFAULT 0.0,
    idf         DOUBLE PRECISION NOT NULL DEFAULT 0.0
);

-- Catalog descriptions for the table and each of its columns, including tf
-- and idf, which previously had none.
COMMENT ON TABLE word_record_index IS 'The words linked to a record.';
COMMENT ON COLUMN word_record_index.word IS 'The word text.';
COMMENT ON COLUMN word_record_index.record IS 'The owner record.';
COMMENT ON COLUMN word_record_index.appearances IS 'Appearances of the word in the record.';
COMMENT ON COLUMN word_record_index.tf IS 'Term frequency in the record.';
COMMENT ON COLUMN word_record_index.idf IS 'IDF of the word in the record.';

----------------------------
-- Table: WORD_CORPUS_INDEX
----------------------------
CREATE TABLE word_corpus_index
(
    -- Link row between a word_index entry and a corpus_info entry.
    word        TEXT             NOT NULL REFERENCES word_index (word),
    corpus      INT              NOT NULL REFERENCES corpus_info (id),
    appearances INT              NOT NULL DEFAULT 0,
    tf          DOUBLE PRECISION NOT NULL DEFAULT 0.0,
    idf         DOUBLE PRECISION NOT NULL DEFAULT 0.0
);

-- Catalog descriptions for the table and each of its columns, including
-- word, which previously had none.
COMMENT ON TABLE word_corpus_index IS 'The words linked to a corpus.';
COMMENT ON COLUMN word_corpus_index.word IS 'The word text.';
COMMENT ON COLUMN word_corpus_index.corpus IS 'The linked corpus.';
COMMENT ON COLUMN word_corpus_index.appearances IS 'Total appearances of the word in the corpus.';
COMMENT ON COLUMN word_corpus_index.tf IS 'Term frequency in the corpus.';
COMMENT ON COLUMN word_corpus_index.idf IS 'IDF of the word in the corpus.';
44 changes: 44 additions & 0 deletions src/configuration.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/// Top-level application settings.
///
/// Deserialized with `serde`; `get_configuration` produces a value of this
/// type from the `configuration` config source.
#[derive(serde::Deserialize)]
pub struct Settings {
/// Database connection settings (the `database` section of the configuration).
pub database: DatabaseSettings,
/// Value of the `application_port` configuration key.
/// NOTE(review): presumably the port the HTTP server listens on — confirm at the call site.
pub application_port: u16,
}

/// Connection parameters for the PostgreSQL database.
///
/// Deserialized with `serde`. The fields are combined into a `postgres://`
/// URL by `connection_string` and `connection_string_without_db`.
#[derive(serde::Deserialize)]
pub struct DatabaseSettings {
/// User name placed in the user-info part of the connection URL.
pub username: String,
/// Password placed in the connection URL exactly as given (this type applies no escaping).
pub password: String,
/// Port of the database server.
pub port: u16,
/// Host name or address of the database server.
pub host: String,
/// Name of the database; becomes the path component of the full connection URL.
pub database_name: String,
}

/// Connection-URL builders for the database settings.
impl DatabaseSettings {
    /// Builds the complete PostgreSQL URL, ending in `/<database_name>`.
    pub fn connection_string(&self) -> String {
        let Self {
            username,
            password,
            host,
            port,
            database_name,
        } = self;

        format!(
            "postgres://{}:{}@{}:{}/{}",
            username, password, host, port, database_name
        )
    }

    /// Builds the PostgreSQL URL for the server only, i.e. without the
    /// trailing database name.
    pub fn connection_string_without_db(&self) -> String {
        let Self {
            username,
            password,
            host,
            port,
            ..
        } = self;

        format!("postgres://{}:{}@{}:{}", username, password, host, port)
    }
}

/// Loads the application [`Settings`] from the config source named
/// `configuration`.
///
/// # Errors
///
/// Returns a [`config::ConfigError`] when the source cannot be merged or its
/// contents do not deserialize into [`Settings`].
pub fn get_configuration() -> Result<Settings, config::ConfigError> {
    // The name is given without an extension; format resolution is left to the `config` crate.
    let source = config::File::with_name("configuration");

    let mut loader = config::Config::default();
    loader.merge(source)?;

    loader.try_into()
}
4 changes: 4 additions & 0 deletions src/domain/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/// This mod contains the structs used for returning results.
mod response;

pub use response::*;
Loading

0 comments on commit f5faa16

Please sign in to comment.