Skip to content

Commit

Permalink
Add sharded in-memory index, python feature to enable rust-side usage…
Browse files Browse the repository at this point in the history
…, update publish action to include more targets
  • Loading branch information
luciaquirke committed Sep 2, 2024
1 parent fd81deb commit 23ad4f7
Show file tree
Hide file tree
Showing 10 changed files with 346 additions and 38 deletions.
95 changes: 68 additions & 27 deletions .github/workflows/publish-wheel.yml
Original file line number Diff line number Diff line change
@@ -1,49 +1,90 @@
name: Publish

on:
push:
tags:
- 'v*'
workflow_dispatch:

jobs:
publish:
name: Publish for ${{ matrix.target }}
build:
name: Build wheels for ${{ matrix.os }} - Python ${{ matrix.python-version }}
strategy:
fail-fast: false
matrix:
target:
- x86_64-unknown-linux-gnu
- x86_64-apple-darwin
- x86_64-pc-windows-msvc
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: ['3.10', '3.11']
include:
- target: x86_64-unknown-linux-gnu
os: ubuntu-latest
- target: x86_64-apple-darwin
os: macos-latest
- target: x86_64-pc-windows-msvc
os: windows-latest
- os: ubuntu-latest
target: x86_64-unknown-linux-gnu
- os: macos-latest
target: x86_64-apple-darwin
- os: windows-latest
target: x86_64-pc-windows-msvc
# Add ARM64 builds
- os: ubuntu-latest
target: aarch64-unknown-linux-gnu
python-version: '3.10'
- os: macos-latest
target: aarch64-apple-darwin
python-version: '3.10'

runs-on: ${{ matrix.os }}
environment: PyPI
steps:
- uses: actions/checkout@v3
- name: Set up Python
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: ${{ matrix.python-version }}
- name: Set up Rust
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
target: ${{ matrix.target }}
override: true
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.target }}
args: --release --out dist --interpreter python${{ matrix.python-version }} --features python
manylinux: auto
- name: Build universal2 wheel for macOS
if: matrix.os == 'macos-latest' && matrix.python-version == '3.10'
uses: PyO3/maturin-action@v1
with:
args: --release --universal2 --out dist --interpreter python${{ matrix.python-version }} --features python
- name: Upload wheels
uses: actions/upload-artifact@v2
with:
name: wheels
path: dist

build-sdist:
name: Build source distribution
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Build sdist
uses: PyO3/maturin-action@v1
with:
command: sdist
args: --out dist
- name: Upload sdist
uses: actions/upload-artifact@v2
with:
name: wheels
path: dist

- name: Publish
uses: messense/maturin-action@v1
env:
MATURIN_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
with:
maturin-version: latest
command: publish
args: --target ${{ matrix.target }} --username=__token__ --skip-existing
publish:
name: Publish to PyPI
needs: [build, build-sdist]
runs-on: ubuntu-latest
steps:
- uses: actions/download-artifact@v2
with:
name: wheels
path: dist
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
user: __token__
password: ${{ secrets.PYPI_API_TOKEN }}
packages_dir: dist/
skip_existing: true
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ edition = "2021"
name = "tokengrams"
crate-type = ["cdylib", "rlib"]

[features]
default = []
python = ["pyo3/extension-module"]

[dependencies]
anyhow = "1.0.81"
bincode = "1.3.3"
Expand Down
1 change: 1 addition & 0 deletions src/bindings/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pub mod in_memory_index;
pub mod memmap_index;
pub mod sharded_memmap_index;
pub mod sharded_in_memory_index;
13 changes: 6 additions & 7 deletions src/in_memory_index.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use anyhow::Result;
use funty::Unsigned;
use pyo3::prelude::*;
use rayon::prelude::*;
use std::collections::HashMap;
use std::fmt::Debug;
Expand Down Expand Up @@ -37,7 +36,7 @@ impl<T: Unsigned + Debug> InMemoryIndexRs<T> {
token_limit: Option<usize>,
vocab: usize,
verbose: bool,
) -> PyResult<Self> {
) -> Result<Self> {
let mut buffer = Vec::new();
let mut file = File::open(&path)?;

Expand All @@ -61,25 +60,25 @@ impl<T: Unsigned + Debug> InMemoryIndexRs<T> {

fn read_file_to_boxed_slice<E: Unsigned>(path: &str) -> Result<Box<[E]>> {
let mut file = File::open(path)?;
let file_len = file.metadata()?.len() as usize;
let file_len_bytes = file.metadata()?.len() as usize;

// Ensure file size is a multiple of size of E
if file_len % std::mem::size_of::<T>() != 0 {
if file_len_bytes % std::mem::size_of::<E>() != 0 {
anyhow::bail!("File size is not a multiple of element size");
}

let num_elements = file_len / std::mem::size_of::<E>();
let num_elements = file_len_bytes / std::mem::size_of::<E>();
let mut vec: Vec<E> = Vec::with_capacity(num_elements);
unsafe {
let buf = std::slice::from_raw_parts_mut(vec.as_mut_ptr() as *mut u8, file_len);
let buf = std::slice::from_raw_parts_mut(vec.as_mut_ptr() as *mut u8, file_len_bytes);
file.read_exact(buf)?;
vec.set_len(num_elements);
}

Ok(vec.into_boxed_slice())
}

pub fn from_disk(text_path: String, table_path: String, vocab: usize) -> PyResult<Self> {
pub fn from_disk(text_path: String, table_path: String, vocab: usize) -> Result<Self> {
let text = Self::read_file_to_boxed_slice::<T>(&text_path)?;
let table = Self::read_file_to_boxed_slice::<u64>(&table_path)?;

Expand Down
9 changes: 9 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,15 @@ pub mod mmap_slice;
pub use bindings::in_memory_index::InMemoryIndex;
pub use bindings::memmap_index::MemmapIndex;
pub use bindings::sharded_memmap_index::ShardedMemmapIndex;
pub use bindings::sharded_in_memory_index::ShardedInMemoryIndex;
pub use sharded_in_memory_index::ShardedInMemoryIndexRs;
pub use in_memory_index::InMemoryIndexRs;
pub use sample::Sample;

pub use table::SuffixTable;

/// Python bindings
#[cfg(feature = "python")]
use pyo3::prelude::*;

mod bindings;
Expand All @@ -13,13 +19,16 @@ mod memmap_index;
mod par_quicksort;
mod sample;
mod sharded_memmap_index;
mod sharded_in_memory_index;
mod table;
mod util;

/// Python module definition for `tokengrams`.
///
/// Only compiled when the `python` Cargo feature is enabled. Registers the
/// four index classes (`InMemoryIndex`, `MemmapIndex`, `ShardedMemmapIndex`,
/// `ShardedInMemoryIndex`) on the module `m`.
///
/// # Errors
///
/// Returns the first error produced by `add_class`; later classes are not
/// registered in that case.
#[cfg(feature = "python")]
#[pymodule]
fn tokengrams(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<InMemoryIndex>()?;
m.add_class::<MemmapIndex>()?;
m.add_class::<ShardedMemmapIndex>()?;
m.add_class::<ShardedInMemoryIndex>()?;
Ok(())
}
2 changes: 1 addition & 1 deletion src/sample.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ pub trait Sample<T: Unsigned>: Send + Sync {
/// Warning: O(k**n) where k is vocabulary size, use with caution.
/// Improve smoothed model quality by replacing the default delta hyperparameters
/// for models of order n and below with improved estimates over the entire index.
/// https://people.eecs.berkeley.edu/~klein/cs294-5/chen_goodman.pdf, page 16."""
/// <https://people.eecs.berkeley.edu/~klein/cs294-5/chen_goodman.pdf>, page 16.
fn estimate_deltas(&mut self, n: usize) {
for i in 1..n + 1 {
if self.get_cache().n_delta.contains_key(&i) {
Expand Down
Loading

0 comments on commit 23ad4f7

Please sign in to comment.