From 3c518d42ab4d1e30be5699eee9193e390b023c38 Mon Sep 17 00:00:00 2001 From: Master-Hash Date: Fri, 21 Feb 2025 14:04:24 +0100 Subject: [PATCH] refactor: Rust 2024 & use CStr literal --- .github/workflows/build.yml | 18 +- Cargo.lock | 28 +-- Cargo.toml | 2 +- src/lib.rs | 343 ++++++++++++++++++------------------ 4 files changed, 196 insertions(+), 195 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b026a28..94e8a85 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,31 +19,37 @@ jobs: target: x86_64-pc-windows-gnu, os: windows-2025, feature: icu_segmenter, + bname: ewt.dll, } - { target: x86_64-pc-windows-gnu, os: windows-2025, feature: windows, + bname: ewt.dll, } - { target: x86_64-unknown-linux-gnu, os: ubuntu-24.04, feature: icu_segmenter, + bname: libewt.so, } - { target: aarch64-unknown-linux-gnu, os: ubuntu-24.04-arm, feature: icu_segmenter, + bname: libewt.so, } - { target: x86_64-apple-darwin, os: macos-15, feature: icu_segmenter, + bname: libewt.dylib, } - { target: aarch64-apple-darwin, os: macos-15, feature: icu_segmenter, + bname: libewt.dylib, } steps: - name: Checkout source code @@ -61,6 +67,7 @@ jobs: - name: Show version information (Rust, cargo) run: | + rustup update rustup -V rustup toolchain list rustup target add ${{ matrix.job.target }} @@ -70,16 +77,9 @@ jobs: - name: Build run: cargo build --locked --release --target=${{ matrix.job.target }} --no-default-features -F ${{ matrix.job.feature }} - - name: debug? - if: runner.os == 'Windows' || runner.os == 'Linux' - run: | - tree target/ - - name: "Artifact upload" uses: actions/upload-artifact@master with: - name: ${{ matrix.job.target }}-libEWT-${{ matrix.job.feature }} + name: libewt-${{ matrix.job.feature }}-${{ matrix.job.target }} path: | - target/${{ matrix.job.target }}/release/*.dll - target/${{ matrix.job.target }}/release/*.so - target/${{ matrix.job.target }}/release/*.dylib + target/${{ matrix.job.target }}/release/${{ matrix.job.bname }} diff --git a/Cargo.lock b/Cargo.lock index 71d0522..e033b95 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -65,9 +65,9 @@ dependencies = [ [[package]] name = "core_maths" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3b02505ccb8c50b0aa21ace0fc08c3e53adebd4e58caa18a36152803c7709a3" +checksum = "77745e017f5edba1a9c1d854f6f3a52dac8a12dd5af5d2f54aecf61e43d80d30" dependencies = [ "libm", ] @@ -271,9 +271,9 @@ checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" [[package]] name = "log" -version = "0.4.25" +version = "0.4.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" +checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e" [[package]] name = "memchr" @@ -366,24 +366,24 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "rustc-hash" -version = "2.1.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" [[package]] name = "serde" -version = "1.0.217" +version = "1.0.218" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" +checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.217" +version = "1.0.218" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" +checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b" dependencies = [ "proc-macro2", "quote", @@ -404,9 +404,9 @@ checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] name = "syn" -version = "2.0.96" +version = "2.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" +checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" dependencies = [ "proc-macro2", "quote", @@ -436,9 +436,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.14" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +checksum = "00e2473a93778eb0bad35909dff6a10d28e63f792f16ed15e404fca9d5eeedbe" [[package]] name = "utf8_iter" diff --git a/Cargo.toml b/Cargo.toml index 30b3eec..5559a0b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "ewt" version = "0.2.0" -edition = "2021" +edition = "2024" license = "GPL-3.0-or-later" [lib] diff --git a/src/lib.rs b/src/lib.rs index 32b3fb3..56f0bfc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,17 +8,16 @@ include!(concat!(env!("OUT_DIR"), "/bindings.rs")); use icu_segmenter::WordSegmenter; #[cfg(all(not(feature = "windows"), feature = "icu_segmenter"))] use itertools::Itertools; -use std::ffi::CString; use std::os::raw; use std::ptr; #[cfg(feature = "windows")] use std::sync::LazyLock; #[cfg(feature = "windows")] -use windows::core::h; +use windows::Data::Text::SelectableWordsSegmenter; #[cfg(feature = "windows")] use windows::core::HSTRING; #[cfg(feature = "windows")] -use windows::Data::Text::SelectableWordsSegmenter; +use windows::core::h; #[cfg(feature = "windows")] static segmenter: LazyLock = @@ -27,61 +26,60 @@ static segmenter: LazyLock = // icu segmenter seems unable to be initialized in a static variable // Rc is used inside, so it is not Sync -#[no_mangle] +#[unsafe(no_mangle)] #[allow(non_upper_case_globals)] pub static plugin_is_GPL_compatible: libc::c_int = 1; // ordinarily the Rust compiler will mangle funciton names. we don't want to do // that, since the C code won't know what code to call. // we'll also want to use `unsafe` because we need access to raw pointers -#[no_mangle] +#[unsafe(no_mangle)] pub unsafe extern "C" fn emacs_module_init(runtime: *mut emacs_runtime) -> libc::c_int { - let fset_string = CString::new("fset").unwrap(); - - let env = (*runtime).get_environment.unwrap()(runtime); - - let intern = (*env).intern.unwrap(); - let funcall = (*env).funcall.unwrap(); - let make_function = (*env).make_function.unwrap(); - - let Qfset = intern(env, fset_string.as_ptr()); - - let func_emt__do_split_helper_description = CString::new("This function takes a string and return an array of bounds. A bound is a cons with the starting position and the ending position of a word.").unwrap(); - let func_emt__word_at_point_or_forward_description = CString::new("This functions takes a string and a position, and returns the bound of the word at the position. If the position is at bound of two words, it returns the word at the right side of that position. This function does not tokenize the whole string, so it is faster in some cases.").unwrap(); - let func_emt__do_split_helper_name = CString::new("emt--do-split-helper").unwrap(); - let func_emt__word_at_point_or_forward_name = - CString::new("emt--word-at-point-or-forward-helper").unwrap(); - let Qsplit_helper = intern(env, func_emt__do_split_helper_name.as_ptr()); - let Qword_at_point = intern(env, func_emt__word_at_point_or_forward_name.as_ptr()); - let func_emt__do_split_helper = make_function( - env, - 1, - 1, - Some(Femt__do_split_helper), - func_emt__do_split_helper_description.as_ptr(), - std::ptr::null_mut(), - ); - funcall( - env, - Qfset, - 2, - [Qsplit_helper, func_emt__do_split_helper].as_mut_ptr(), - ); - let func_emt__do_split_helper = make_function( - env, - 2, - 2, - Some(Femt__word_at_point_or_forward), - func_emt__word_at_point_or_forward_description.as_ptr(), - std::ptr::null_mut(), - ); - funcall( - env, - Qfset, - 2, - [Qword_at_point, func_emt__do_split_helper].as_mut_ptr(), - ); - 0 + unsafe { + let env = (*runtime).get_environment.unwrap()(runtime); + + let intern = (*env).intern.unwrap(); + let funcall = (*env).funcall.unwrap(); + let make_function = (*env).make_function.unwrap(); + + let Qfset = intern(env, c"fset".as_ptr()); + + let func_emt__do_split_helper_description = c"This function takes a string and return an array of bounds. A bound is a cons with the starting position and the ending position of a word."; + let func_emt__word_at_point_or_forward_description = c"This functions takes a string and a position, and returns the bound of the word at the position. If the position is at bound of two words, it returns the word at the right side of that position. This function does not tokenize the whole string, so it is faster in some cases."; + let func_emt__do_split_helper_name = c"emt--do-split-helper"; + let func_emt__word_at_point_or_forward_name = c"emt--word-at-point-or-forward-helper"; + let Qsplit_helper = intern(env, func_emt__do_split_helper_name.as_ptr()); + let Qword_at_point = intern(env, func_emt__word_at_point_or_forward_name.as_ptr()); + let func_emt__do_split_helper = make_function( + env, + 1, + 1, + Some(Femt__do_split_helper), + func_emt__do_split_helper_description.as_ptr(), + std::ptr::null_mut(), + ); + funcall( + env, + Qfset, + 2, + [Qsplit_helper, func_emt__do_split_helper].as_mut_ptr(), + ); + let func_emt__do_split_helper = make_function( + env, + 2, + 2, + Some(Femt__word_at_point_or_forward), + func_emt__word_at_point_or_forward_description.as_ptr(), + std::ptr::null_mut(), + ); + funcall( + env, + Qfset, + 2, + [Qword_at_point, func_emt__do_split_helper].as_mut_ptr(), + ); + 0 + } } unsafe extern "C" fn Femt__do_split_helper( @@ -90,64 +88,65 @@ unsafe extern "C" fn Femt__do_split_helper( args: *mut emacs_value, data: *mut raw::c_void, ) -> emacs_value { - let cons_string = CString::new("cons").unwrap(); - let vector_string = CString::new("vector").unwrap(); - let intern = (*env).intern.unwrap(); - let funcall = (*env).funcall.unwrap(); - let make_integer = (*env).make_integer.unwrap(); - let copy_string_contents = (*env).copy_string_contents.unwrap(); - - let Qcons = intern(env, cons_string.as_ptr()); - let Qvector = intern(env, vector_string.as_ptr()); - - let mut len: isize = 0; - let is_ok = copy_string_contents(env, *args, ptr::null_mut(), &mut len); - let mut buf = vec![0u8; len as usize]; - let is_ok = copy_string_contents(env, *args, buf.as_mut_ptr() as *mut raw::c_char, &mut len); - - strip_trailing_zero_bytes(&mut buf); - - let param_u8 = String::from_utf8(buf).unwrap(); - #[cfg(feature = "windows")] - let mut consCell = { - let param_hstring = HSTRING::from(param_u8); - let res = segmenter.GetTokens(¶m_hstring).unwrap(); - - let iConsCell = res.into_iter().map(|i| { - let segment = i.SourceTextSegment().unwrap(); - let l = make_integer(env, segment.StartPosition.into()); - let r = make_integer(env, (segment.StartPosition + segment.Length).into()); - funcall(env, Qcons, 2, [l, r].as_mut_ptr()) - }); - iConsCell.collect::>() - }; - #[cfg(all(not(feature = "windows"), feature = "icu_segmenter"))] - let mut consCell = { - let segmenter_icu = WordSegmenter::new_auto(); - let segments = segmenter_icu - .segment_str(¶m_u8) - .tuple_windows() - .map(|(i, j)| ¶m_u8[i..j]); - let ss = segments.map(|s| s.chars().count()); - // we need prefix sum - // from: [4, 1, 4] - // to [(0, 4), (4, 5), (5, 9)] - let iConsCell = ss - .scan(0, |acc, x| { - let res = Some((*acc, *acc + x)); - *acc += x; - res - }) - .map(|(l, r)| { - let l = make_integer(env, l as i64); - let r = make_integer(env, r as i64); + unsafe { + let intern = (*env).intern.unwrap(); + let funcall = (*env).funcall.unwrap(); + let make_integer = (*env).make_integer.unwrap(); + let copy_string_contents = (*env).copy_string_contents.unwrap(); + + let Qcons = intern(env, c"cons".as_ptr()); + let Qvector = intern(env, c"vector".as_ptr()); + + let mut len: isize = 0; + let is_ok = copy_string_contents(env, *args, ptr::null_mut(), &mut len); + let mut buf = vec![0u8; len as usize]; + let is_ok = + copy_string_contents(env, *args, buf.as_mut_ptr() as *mut raw::c_char, &mut len); + + strip_trailing_zero_bytes(&mut buf); + + let param_u8 = String::from_utf8(buf).unwrap(); + #[cfg(feature = "windows")] + let mut consCell = { + let param_hstring = HSTRING::from(param_u8); + let res = segmenter.GetTokens(¶m_hstring).unwrap(); + + let iConsCell = res.into_iter().map(|i| { + let segment = i.SourceTextSegment().unwrap(); + let l = make_integer(env, segment.StartPosition.into()); + let r = make_integer(env, (segment.StartPosition + segment.Length).into()); funcall(env, Qcons, 2, [l, r].as_mut_ptr()) }); - iConsCell.collect::>() - }; - let l = consCell.len(); - let ddd = consCell.as_mut_ptr(); - funcall(env, Qvector, l as isize, ddd) + iConsCell.collect::>() + }; + #[cfg(all(not(feature = "windows"), feature = "icu_segmenter"))] + let mut consCell = { + let segmenter_icu = WordSegmenter::new_auto(); + let segments = segmenter_icu + .segment_str(¶m_u8) + .tuple_windows() + .map(|(i, j)| ¶m_u8[i..j]); + let ss = segments.map(|s| s.chars().count()); + // we need prefix sum + // from: [4, 1, 4] + // to [(0, 4), (4, 5), (5, 9)] + let iConsCell = ss + .scan(0, |acc, x| { + let res = Some((*acc, *acc + x)); + *acc += x; + res + }) + .map(|(l, r)| { + let l = make_integer(env, l as i64); + let r = make_integer(env, r as i64); + funcall(env, Qcons, 2, [l, r].as_mut_ptr()) + }); + iConsCell.collect::>() + }; + let l = consCell.len(); + let ddd = consCell.as_mut_ptr(); + funcall(env, Qvector, l as isize, ddd) + } } unsafe extern "C" fn Femt__word_at_point_or_forward( @@ -156,71 +155,73 @@ unsafe extern "C" fn Femt__word_at_point_or_forward( args: *mut emacs_value, data: *mut raw::c_void, ) -> emacs_value { - let cons_string = CString::new("cons").unwrap(); - let intern = (*env).intern.unwrap(); - let funcall = (*env).funcall.unwrap(); - let make_integer = (*env).make_integer.unwrap(); - let extract_integer = (*env).extract_integer.unwrap(); - let copy_string_contents = (*env).copy_string_contents.unwrap(); - - let Qcons = intern(env, cons_string.as_ptr()); - - let mut len: isize = 0; - let is_ok = copy_string_contents(env, *args, ptr::null_mut(), &mut len); - let mut buf = vec![0u8; len as usize]; - let is_ok = copy_string_contents(env, *args, buf.as_mut_ptr() as *mut raw::c_char, &mut len); - - strip_trailing_zero_bytes(&mut buf); - - let n = extract_integer(env, *args.offset(1)); - - let param_u8 = String::from_utf8(buf).unwrap(); - #[cfg(feature = "windows")] - let (l, r) = { - let param_hstring = HSTRING::from(param_u8); - let res = segmenter - .GetTokenAt(¶m_hstring, n.try_into().unwrap()) - .unwrap(); - - let segment = res.SourceTextSegment().unwrap(); - let l = make_integer(env, segment.StartPosition.into()); - let r = make_integer(env, (segment.StartPosition + segment.Length).into()); - (l, r) - }; - #[cfg(all(not(feature = "windows"), feature = "icu_segmenter"))] - let (l, r) = { - // Sadly WordSegmenter does not provide a way to get the nth token - let segmenter_icu = WordSegmenter::new_auto(); - let segments = segmenter_icu - .segment_str(¶m_u8) - .tuple_windows() - .map(|(i, j)| ¶m_u8[i..j]); - let mut ss = segments.map(|s| s.chars().count()); - // we need prefix sum - // from: [4, 1, 4], 4 - // to [(4, 5)] - // from: [4, 1, 4], 6 - // to [(5, 9)] - let iConsCell = ss.try_fold(0, |acc, x| { - let r = acc + x; - let l = acc; - if n < r.try_into().unwrap() { - Err((l, r)) - } else { - Ok(r) - } - }); - match iConsCell { - // Seems all the program will be panic if we reach here - Ok(_) => unreachable!(), - Err((l, r)) => { - let l = make_integer(env, l as i64); - let r = make_integer(env, r as i64); - (l, r) + unsafe { + let intern = (*env).intern.unwrap(); + let funcall = (*env).funcall.unwrap(); + let make_integer = (*env).make_integer.unwrap(); + let extract_integer = (*env).extract_integer.unwrap(); + let copy_string_contents = (*env).copy_string_contents.unwrap(); + + let Qcons = intern(env, c"cons".as_ptr()); + + let mut len: isize = 0; + let is_ok = copy_string_contents(env, *args, ptr::null_mut(), &mut len); + let mut buf = vec![0u8; len as usize]; + let is_ok = + copy_string_contents(env, *args, buf.as_mut_ptr() as *mut raw::c_char, &mut len); + + strip_trailing_zero_bytes(&mut buf); + + let n = extract_integer(env, *args.offset(1)); + + let param_u8 = String::from_utf8(buf).unwrap(); + #[cfg(feature = "windows")] + let (l, r) = { + let param_hstring = HSTRING::from(param_u8); + let res = segmenter + .GetTokenAt(¶m_hstring, n.try_into().unwrap()) + .unwrap(); + + let segment = res.SourceTextSegment().unwrap(); + let l = make_integer(env, segment.StartPosition.into()); + let r = make_integer(env, (segment.StartPosition + segment.Length).into()); + (l, r) + }; + #[cfg(all(not(feature = "windows"), feature = "icu_segmenter"))] + let (l, r) = { + // Sadly WordSegmenter does not provide a way to get the nth token + let segmenter_icu = WordSegmenter::new_auto(); + let segments = segmenter_icu + .segment_str(¶m_u8) + .tuple_windows() + .map(|(i, j)| ¶m_u8[i..j]); + let mut ss = segments.map(|s| s.chars().count()); + // we need prefix sum + // from: [4, 1, 4], 4 + // to [(4, 5)] + // from: [4, 1, 4], 6 + // to [(5, 9)] + let iConsCell = ss.try_fold(0, |acc, x| { + let r = acc + x; + let l = acc; + if n < r.try_into().unwrap() { + Err((l, r)) + } else { + Ok(r) + } + }); + match iConsCell { + // Seems all the program will be panic if we reach here + Ok(_) => unreachable!(), + Err((l, r)) => { + let l = make_integer(env, l as i64); + let r = make_integer(env, r as i64); + (l, r) + } } - } - }; - funcall(env, Qcons, 2, [l, r].as_mut_ptr()) + }; + funcall(env, Qcons, 2, [l, r].as_mut_ptr()) + } } // Thank you,