Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Commit

Permalink
Merge #364
Browse files Browse the repository at this point in the history
364: Fix all the benchmarks  r=Kerollmops a=irevoire

#324 broke all benchmarks.
I fixed everything and noticed that `cargo check --all` was insufficient to check the bench in multiple workspaces, so I also updated the CI to use `cargo check --workspace --all-targets`.

Co-authored-by: Tamo <[email protected]>
  • Loading branch information
bors[bot] and irevoire authored Sep 22, 2021
2 parents 16790ee + 176160d commit ad3befa
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 63 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
uses: actions-rs/cargo@v1
with:
command: check
args: --all
args: --workspace --all-targets
- name: Run cargo test
uses: actions-rs/cargo@v1
with:
Expand Down
3 changes: 3 additions & 0 deletions benchmarks/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ publish = false

[dependencies]
milli = { path = "../milli" }
anyhow = "1.0"
serde_json = { version = "1.0.62", features = ["preserve_order"] }
csv = "1.1.6"

[target.'cfg(target_os = "linux")'.dependencies]
jemallocator = "0.3.2"
Expand Down
80 changes: 27 additions & 53 deletions benchmarks/benches/indexing.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
mod datasets_paths;
mod utils;

use std::fs::{create_dir_all, remove_dir_all, File};
use std::fs::{create_dir_all, remove_dir_all};
use std::path::Path;

use criterion::{criterion_group, criterion_main, Criterion};
use heed::EnvOpenOptions;
use milli::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
use milli::update::UpdateBuilder;
use milli::Index;

#[cfg(target_os = "linux")]
Expand Down Expand Up @@ -67,15 +68,10 @@ fn indexing_songs_default(c: &mut Criterion) {
move |index| {
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.index_documents(&mut wtxn, &index);

builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
"could not find the dataset in: {}",
datasets_paths::SMOL_SONGS
));
builder.execute(reader, |_, _| ()).unwrap();
let builder = update_builder.index_documents(&mut wtxn, &index);

let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
builder.execute(documents, |_, _| ()).unwrap();
wtxn.commit().unwrap();

index.prepare_for_closing().wait();
Expand Down Expand Up @@ -118,15 +114,10 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) {
move |index| {
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.index_documents(&mut wtxn, &index);

builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
"could not find the dataset in: {}",
datasets_paths::SMOL_SONGS
));
builder.execute(reader, |_, _| ()).unwrap();
let builder = update_builder.index_documents(&mut wtxn, &index);

let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
builder.execute(documents, |_, _| ()).unwrap();
wtxn.commit().unwrap();

index.prepare_for_closing().wait();
Expand Down Expand Up @@ -165,15 +156,10 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) {
move |index| {
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.index_documents(&mut wtxn, &index);

builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
"could not find the dataset in: {}",
datasets_paths::SMOL_SONGS
));
builder.execute(reader, |_, _| ()).unwrap();
let builder = update_builder.index_documents(&mut wtxn, &index);

let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
builder.execute(documents, |_, _| ()).unwrap();
wtxn.commit().unwrap();

index.prepare_for_closing().wait();
Expand Down Expand Up @@ -211,15 +197,10 @@ fn indexing_wiki(c: &mut Criterion) {
move |index| {
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.index_documents(&mut wtxn, &index);

builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
let reader = File::open(datasets_paths::SMOL_WIKI_ARTICLES).expect(&format!(
"could not find the dataset in: {}",
datasets_paths::SMOL_SONGS
));
builder.execute(reader, |_, _| ()).unwrap();
let builder = update_builder.index_documents(&mut wtxn, &index);

let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
builder.execute(documents, |_, _| ()).unwrap();
wtxn.commit().unwrap();

index.prepare_for_closing().wait();
Expand Down Expand Up @@ -262,13 +243,10 @@ fn indexing_movies_default(c: &mut Criterion) {
move |index| {
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.index_documents(&mut wtxn, &index);
let builder = update_builder.index_documents(&mut wtxn, &index);

builder.update_format(UpdateFormat::Json);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
let reader = File::open(datasets_paths::MOVIES)
.expect(&format!("could not find the dataset in: {}", datasets_paths::MOVIES));
builder.execute(reader, |_, _| ()).unwrap();
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
builder.execute(documents, |_, _| ()).unwrap();
wtxn.commit().unwrap();

index.prepare_for_closing().wait();
Expand Down Expand Up @@ -316,15 +294,11 @@ fn indexing_geo(c: &mut Criterion) {
move |index| {
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.index_documents(&mut wtxn, &index);

builder.update_format(UpdateFormat::JsonStream);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
let reader = File::open(datasets_paths::SMOL_ALL_COUNTRIES).expect(&format!(
"could not find the dataset in: {}",
datasets_paths::SMOL_ALL_COUNTRIES
));
builder.execute(reader, |_, _| ()).unwrap();
let builder = update_builder.index_documents(&mut wtxn, &index);

let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
builder.execute(documents, |_, _| ()).unwrap();

wtxn.commit().unwrap();

index.prepare_for_closing().wait();
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/benches/search_geo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ mod datasets_paths;
mod utils;

use criterion::{criterion_group, criterion_main};
use milli::update::{Settings, UpdateFormat};
use milli::update::Settings;
use utils::Conf;

#[cfg(target_os = "linux")]
Expand Down Expand Up @@ -33,7 +33,7 @@ fn base_conf(builder: &mut Settings) {
#[rustfmt::skip]
const BASE_CONF: Conf = Conf {
dataset: datasets_paths::SMOL_ALL_COUNTRIES,
dataset_format: UpdateFormat::JsonStream,
dataset_format: "jsonl",
queries: &[
"",
],
Expand Down
73 changes: 66 additions & 7 deletions benchmarks/benches/utils.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
#![allow(dead_code)]

use std::fs::{create_dir_all, remove_dir_all, File};
use std::io::{self, Cursor, Read, Seek};
use std::path::Path;

use criterion::BenchmarkId;
use heed::EnvOpenOptions;
use milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat};
use milli::documents::DocumentBatchReader;
use milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder};
use milli::{FilterCondition, Index};
use serde_json::{Map, Value};

pub struct Conf<'a> {
/// where we are going to create our database.mmdb directory
Expand All @@ -13,7 +18,7 @@ pub struct Conf<'a> {
/// the dataset to be used, it must be an uncompressed file whose format matches `dataset_format`
pub dataset: &'a str,
/// The format of the dataset
pub dataset_format: UpdateFormat,
pub dataset_format: &'a str,
pub group_name: &'a str,
pub queries: &'a [&'a str],
/// here you can change which criterion are used and in which order.
Expand All @@ -33,7 +38,7 @@ pub struct Conf<'a> {
impl Conf<'_> {
pub const BASE: Self = Conf {
database_name: "benches.mmdb",
dataset_format: UpdateFormat::Csv,
dataset_format: "csv",
dataset: "",
group_name: "",
queries: &[],
Expand Down Expand Up @@ -87,11 +92,10 @@ pub fn base_setup(conf: &Conf) -> Index {
if let None = conf.primary_key {
builder.enable_autogenerate_docids();
}
builder.update_format(conf.dataset_format);
let documents = documents_from(conf.dataset, conf.dataset_format);

builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
let reader = File::open(conf.dataset)
.expect(&format!("could not find the dataset in: {}", conf.dataset));
builder.execute(reader, |_, _| ()).unwrap();
builder.execute(documents, |_, _| ()).unwrap();
wtxn.commit().unwrap();

index
Expand Down Expand Up @@ -128,3 +132,58 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
index.prepare_for_closing().wait();
}
}

pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader<impl Read + Seek> {
let reader =
File::open(filename).expect(&format!("could not find the dataset in: {}", filename));
let documents = match filetype {
"csv" => documents_from_csv(reader).unwrap(),
"json" => documents_from_json(reader).unwrap(),
"jsonl" => documents_from_jsonl(reader).unwrap(),
otherwise => panic!("invalid update format {:?}", otherwise),
};
DocumentBatchReader::from_reader(Cursor::new(documents)).unwrap()
}

/// Converts a stream of concatenated JSON objects read from `reader` into the bytes of a
/// milli document batch.
///
/// Every value in the stream is deserialized as a JSON object and added to the batch;
/// the first deserialization or insertion error is returned to the caller.
fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
    let mut buffer = Cursor::new(Vec::new());
    let mut batch = milli::documents::DocumentBatchBuilder::new(&mut buffer)?;

    let objects = serde_json::Deserializer::from_reader(reader)
        .into_iter::<serde_json::Map<String, serde_json::Value>>();
    for object in objects {
        batch.add_documents(object?)?;
    }
    batch.finish()?;

    Ok(buffer.into_inner())
}

/// Converts the single JSON value read from `reader` into the bytes of a milli document
/// batch.
///
/// The whole input is parsed as one `serde_json::Value` and handed to the batch builder
/// in a single call; parsing and insertion errors are returned to the caller.
fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
    let mut buffer = Cursor::new(Vec::new());
    let mut batch = milli::documents::DocumentBatchBuilder::new(&mut buffer)?;

    let parsed = serde_json::from_reader::<_, serde_json::Value>(reader)?;
    batch.add_documents(parsed)?;
    batch.finish()?;

    Ok(buffer.into_inner())
}

/// Converts the CSV content read from `reader` into the bytes of a milli document batch.
///
/// Each record is deserialized into a JSON object and added to the batch; the first
/// deserialization or insertion error is returned to the caller.
fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
    let mut buffer = Cursor::new(Vec::new());
    let mut batch = milli::documents::DocumentBatchBuilder::new(&mut buffer)?;

    let mut csv_reader = csv::Reader::from_reader(reader);
    for row in csv_reader.deserialize::<Map<String, Value>>() {
        batch.add_documents(row?)?;
    }

    batch.finish()?;

    Ok(buffer.into_inner())
}

0 comments on commit ad3befa

Please sign in to comment.