Evaluation of fitted prices
ekoutanov committed Oct 31, 2023
1 parent 006eb66 commit 774b170
Showing 13 changed files with 412 additions and 362 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -16,7 +16,7 @@ exclude = ["/images", "/bin", "/.idea", "/.github", "/coverage", "/doc", "/examp
anyhow = "1.0.75"
chrono = "0.4.31"
clap = { version = "4.4.6", features = ["derive"] }
racing_scraper = "0.0.7"
racing_scraper = "0.0.8"
serde_json = "1.0.107"
stanza = "0.3.0"
tinyrand = "0.5.0"
25 changes: 14 additions & 11 deletions README.md
@@ -13,17 +13,20 @@ Circa 15M simulations/sec of a top-4 podium over 14 runners using the [tinyrand]
Sourced from `examples/multi.rs`. To try this example, run `just multi` on the command line. You'll need [just](https://github.com/casey/just) installed.

```rust
use std::error::Error;
use std::path::PathBuf;

use stanza::renderer::console::Console;
use stanza::renderer::Renderer;

use brumby::display::DisplaySlice;
use brumby::file::ReadJsonFile;
use brumby::market::{Market, OverroundMethod};
use brumby::model::cf::Coefficients;
use brumby::model::{Calibrator, Config, WinPlace};
use brumby::model::cf::Coefficients;
use brumby::model::fit::FitOptions;
use brumby::print;
use brumby::selection::{Rank, Runner};
use stanza::renderer::console::Console;
use stanza::renderer::Renderer;
use std::error::Error;
use std::path::PathBuf;

fn main() -> Result<(), Box<dyn Error>> {
// prices taken from a popular website
@@ -50,15 +53,15 @@ fn main() -> Result<(), Box<dyn Error>> {
28.0,
];

// load coefficients from a file and create a calibrator
// load coefficients from a file and create a calibrator for model fitting
let coefficients = Coefficients::read_json_file(PathBuf::from("config/thoroughbred.cf.json"))?;
let config = Config {
coefficients,
fit_options: Default::default(),
fit_options: FitOptions::fast()
};
let calibrator = Calibrator::try_from(config)?;

// fit Win and Place probabilities from the supplied prices, undoing the effect of the overrounds
// fit Win and Place probabilities from the supplied prices, undoing the overrounds
let wp_markets = WinPlace {
win: Market::fit(&OverroundMethod::Multiplicative, win_prices, 1.),
place: Market::fit(&OverroundMethod::Multiplicative, place_prices, 3.),
@@ -70,11 +73,11 @@ fn main() -> Result<(), Box<dyn Error>> {

// fit a model using the Win/Place prices and extrapolated overrounds
let model = calibrator.fit(wp_markets, &overrounds)?.value;

// nicely format the derived prices
// nicely format the derived price matrix
let table = print::tabulate_derived_prices(&model.top_n.as_price_matrix());
println!("\n{}", Console::default().render(&table));

// simulate a same-race multi for a chosen selection vector using the previously fitted model
let selections = vec![
Runner::number(6).top(Rank::number(1)),
15 changes: 9 additions & 6 deletions examples/multi.rs
@@ -1,14 +1,17 @@
use std::error::Error;
use std::path::PathBuf;

use stanza::renderer::console::Console;
use stanza::renderer::Renderer;

use brumby::display::DisplaySlice;
use brumby::file::ReadJsonFile;
use brumby::market::{Market, OverroundMethod};
use brumby::model::cf::Coefficients;
use brumby::model::{Calibrator, Config, WinPlace};
use brumby::model::cf::Coefficients;
use brumby::model::fit::FitOptions;
use brumby::print;
use brumby::selection::{Rank, Runner};
use stanza::renderer::console::Console;
use stanza::renderer::Renderer;
use std::error::Error;
use std::path::PathBuf;

fn main() -> Result<(), Box<dyn Error>> {
// probs taken from a popular website
@@ -39,7 +42,7 @@ fn main() -> Result<(), Box<dyn Error>> {
let coefficients = Coefficients::read_json_file(PathBuf::from("config/thoroughbred.cf.json"))?;
let config = Config {
coefficients,
fit_options: Default::default(),
fit_options: FitOptions::fast(),
};
let calibrator = Calibrator::try_from(config)?;

4 changes: 4 additions & 0 deletions justfile
@@ -13,6 +13,10 @@ datadump *ARGS:
backfit *ARGS:
cargo run --release --bin backfit -- {{ARGS}}

# evaluate the fitted model against a given dataset
evaluate *ARGS:
cargo run --release --bin evaluate -- {{ARGS}}
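# e.g. `just evaluate <data-dir> -r t` to evaluate a directory of thoroughbred races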

# run the multi example
multi:
cargo run --example multi --release
5 changes: 1 addition & 4 deletions src/bin/backfit.rs
@@ -53,10 +53,7 @@ fn main() -> Result<(), Box<dyn Error>> {
args.validate()?;
debug!("args: {args:?}");

let regressors_file = args
.regressors
.unwrap_or_else(|| PathBuf::from("../../config/greyhound.r.json"));
let regressors = Regressors::read_json_file(regressors_file)?;
let regressors = Regressors::read_json_file(args.regressors.unwrap())?;
regressors.validate()?;
debug!("regressors:\n{regressors:#?}");

8 changes: 4 additions & 4 deletions src/bin/datadump.rs
@@ -66,8 +66,8 @@ fn main() -> Result<(), Box<dyn Error>> {
if let Some(race_type) = args.race_type {
predicates.push(data::Predicate::Type { race_type });
}
let races = data::read_from_dir(args.dir.unwrap(), PredicateClosures::from(predicates))?;
let races: Vec<_> = races.into_iter().map(EventDetailExt::summarise).collect();
let race_files = data::read_from_dir(args.dir.unwrap(), PredicateClosures::from(predicates))?;
let races: Vec<_> = race_files.into_iter().map(|race_file| race_file.race).map(EventDetailExt::summarise).collect();

for (index, race) in races.iter().enumerate() {
debug!("fitting race: {race:?} ({} of {})", index + 1, races.len());
@@ -99,8 +99,8 @@ fn main() -> Result<(), Box<dyn Error>> {
}
}
}
let elapsed_time = start_time.elapsed();
info!("fitted {} races in {}s", races.len(), elapsed_time.as_millis() as f64 / 1_000.);
let elapsed = start_time.elapsed();
info!("fitted {} races in {}s", races.len(), elapsed.as_millis() as f64 / 1_000.);

Ok(())
}
227 changes: 227 additions & 0 deletions src/bin/evaluate.rs
@@ -0,0 +1,227 @@
use std::collections::HashMap;
use std::env;
use std::error::Error;
use std::path::PathBuf;
use std::time::Instant;

use anyhow::anyhow;
use clap::Parser;
use racing_scraper::models::EventType;
use stanza::renderer::console::Console;
use stanza::renderer::Renderer;
use stanza::style::{HAlign, Header, MinWidth, Styles};
use stanza::table::{Cell, Col, Row, Table};
use tracing::{debug, info};

use brumby::data;
use brumby::data::{EventDetailExt, PredicateClosures, RaceSummary};
use brumby::file::ReadJsonFile;
use brumby::market::{Market, OverroundMethod};
use brumby::model::{Calibrator, Config, fit, TopN, WinPlace};
use brumby::model::cf::Coefficients;

const OVERROUND_METHOD: OverroundMethod = OverroundMethod::Multiplicative;
const TOP_SUBSET: usize = 25;

#[derive(Debug, clap::Parser, Clone)]
struct Args {
/// directory to source the race data from
dir: Option<PathBuf>,

/// race type
#[clap(short = 'r', long, value_parser = parse_race_type)]
race_type: Option<EventType>,
}
impl Args {
fn validate(&self) -> anyhow::Result<()> {
self.dir
.as_ref()
.ok_or(anyhow!("data directory must be specified"))?;
Ok(())
}
}
fn parse_race_type(s: &str) -> anyhow::Result<EventType> {
match s.to_lowercase().as_str() {
"t" | "thoroughbred" => Ok(EventType::Thoroughbred),
"g" | "greyhound" => Ok(EventType::Greyhound),
_ => Err(anyhow!("unsupported race type {s}")),
}
}

fn main() -> Result<(), Box<dyn Error>> {
if env::var("RUST_BACKTRACE").is_err() {
env::set_var("RUST_BACKTRACE", "full")
}
if env::var("RUST_LOG").is_err() {
env::set_var("RUST_LOG", "info")
}
tracing_subscriber::fmt::init();

let args = Args::parse();
args.validate()?;
debug!("args: {args:?}");

let start_time = Instant::now();
let mut predicates = vec![];
if let Some(race_type) = args.race_type {
predicates.push(data::Predicate::Type { race_type });
}
let races = data::read_from_dir(args.dir.unwrap(), PredicateClosures::from(predicates))?;

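// preload a calibration config (fitted coefficients and fit options) for each supported race type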
let mut configs: HashMap<EventType, Config> = HashMap::new();
for race_type in [EventType::Thoroughbred, EventType::Greyhound] {
let filename = match race_type {
EventType::Thoroughbred => "config/thoroughbred.cf.json",
EventType::Greyhound => "config/greyhound.cf.json",
EventType::Harness => unimplemented!(),
};
debug!("loading {race_type} config from {filename}");
let config = Config {
coefficients: Coefficients::read_json_file(filename)?,
fit_options: Default::default(),
};
configs.insert(race_type, config);
}

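// fit a model to each race from its recorded prices and score it by its worst per-rank error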
let mut evaluations = Vec::with_capacity(races.len());
let num_races = races.len();
for (index, race_file) in races.into_iter().enumerate() {
debug!("fitting race: {race_file:?} ({} of {num_races})", index + 1);
let race = race_file.race.summarise();
let calibrator = Calibrator::try_from(configs[&race.race_type].clone())?;
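// rebuild the sample top-N markets from the recorded prices, one market per finishing rank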
let sample_top_n = TopN {
markets: (0..race.prices.rows())
.map(|rank| {
let prices = race.prices.row_slice(rank).to_vec();
Market::fit(&OVERROUND_METHOD, prices, rank as f64 + 1.)
})
.collect(),
};
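// the Win market is rank 1; the Place market is the one covering the number of places paying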
let sample_wp = WinPlace {
win: sample_top_n.markets[0].clone(),
place: sample_top_n.markets[race.places_paying - 1].clone(),
places_paying: race.places_paying,
};
let sample_overrounds = sample_top_n.overrounds()?;
let model = calibrator.fit(sample_wp, &sample_overrounds)?.value;
let derived_prices = model.top_n.as_price_matrix();
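// per-rank RMSRE between the recorded prices and the prices derived from the fitted model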
let errors: Vec<_> = (0..derived_prices.rows())
.map(|rank| {
fit::compute_msre(
&race.prices[rank],
&derived_prices[rank],
&fit::FITTED_PRICE_RANGES[rank],
)
.sqrt()
})
.collect();
let worst_rmsre = *errors.iter().max_by(|a, b| a.total_cmp(b)).unwrap();
debug!("worst_rmsre: {worst_rmsre}");
evaluations.push(Evaluation {
file: race_file.file,
race,
worst_rmsre,
});
}
let elapsed = start_time.elapsed();
info!(
"fitted {} races in {}s",
num_races,
elapsed.as_millis() as f64 / 1_000.
);

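// rank races from best to worst fit and summarise the error distribution by quantile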
evaluations.sort_by(|a, b| a.worst_rmsre.total_cmp(&b.worst_rmsre));
let quantiles = find_quantiles(
&evaluations,
&[0.0, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1.0],
);
let quantiles_table = tabulate_quantiles(&quantiles);
info!(
"quantiles:\n{}",
Console::default().render(&quantiles_table)
);

let best_subset = &evaluations[..usize::min(TOP_SUBSET, evaluations.len())];
let best_table = tabulate_subset(best_subset, 0);
info!(
"best races:\n{}",
Console::default().render(&best_table)
);

let start_index = evaluations.len().saturating_sub(TOP_SUBSET);
let worst_subset = &evaluations[start_index..];
let worst_table = tabulate_subset(worst_subset, start_index);
info!(
"worst races:\n{}",
Console::default().render(&worst_table)
);

Ok(())
}

fn find_quantiles(evaluations: &[Evaluation], quantiles: &[f64]) -> Vec<(f64, f64)> {
let mut quantile_values = Vec::with_capacity(quantiles.len());
for quantile in quantiles {
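// nearest-rank index into evaluations, which are assumed to be pre-sorted by worst RMSRE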
let index = f64::ceil(quantile * evaluations.len() as f64 - 1.) as usize;
quantile_values.push((*quantile, evaluations[index].worst_rmsre));
}
quantile_values
}

fn tabulate_subset(evaluations: &[Evaluation], start_index: usize) -> Table {
let mut table = Table::default()
.with_cols(vec![
Col::new(Styles::default().with(MinWidth(6))),
Col::new(Styles::default().with(MinWidth(12))),
Col::new(Styles::default().with(MinWidth(40))),
Col::new(Styles::default().with(MinWidth(14))),
Col::new(Styles::default().with(MinWidth(14))),
])
.with_row(Row::new(
Styles::default().with(Header(true)),
vec!["Rank".into(), "Worst RMSRE".into(), "File".into(), "Race type".into(), "Places paying".into()],
));
table.push_rows(evaluations.iter().enumerate().map(|(index, evaluation)| {
Row::new(
Styles::default(),
vec![
Cell::new(Styles::default().with(HAlign::Right), format!("{}", index + start_index + 1).into()),
Cell::new(Styles::default().with(HAlign::Right), format!("{:.6}", evaluation.worst_rmsre).into()),
Cell::new(Styles::default(), format!("{}", evaluation.file.to_str().unwrap()).into()),
Cell::new(Styles::default(), format!("{}", evaluation.race.race_type).into()),
Cell::new(Styles::default().with(HAlign::Right), format!("{:.6}", evaluation.race.places_paying).into())
],
)
}));

table
}

fn tabulate_quantiles(quantiles: &[(f64, f64)]) -> Table {
let mut table = Table::default()
.with_cols(vec![
Col::new(Styles::default().with(MinWidth(12))),
Col::new(Styles::default().with(MinWidth(12))),
])
.with_row(Row::new(
Styles::default().with(Header(true)),
vec!["Quantile".into(), "Worst RMSRE".into()],
));
table.push_rows(quantiles.iter().map(|(quantile, rmsre)| {
Row::new(
Styles::default().with(HAlign::Right),
vec![
format!("{quantile:.3}").into(),
format!("{rmsre:.6}").into(),
],
)
}));
table
}

#[derive(Debug)]
struct Evaluation {
file: PathBuf,
race: RaceSummary,
worst_rmsre: f64,
}
4 changes: 2 additions & 2 deletions src/bin/prices.rs
@@ -69,8 +69,8 @@ async fn main() -> Result<(), Box<dyn Error>> {
);

let coefficients_file = match race.race_type {
EventType::Thoroughbred => PathBuf::from("config/thoroughbred.cf.json"),
EventType::Greyhound => PathBuf::from("config/greyhound.cf.json"),
EventType::Thoroughbred => "config/thoroughbred.cf.json",
EventType::Greyhound => "config/greyhound.cf.json",
EventType::Harness => unimplemented!(),
};
debug!("loading coefficients from {coefficients_file:?}");