diff --git a/.gitattributes b/.gitattributes
index 70d7a72..3ade302 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,5 +1,8 @@
-# Ref: https://stackoverflow.com/questions/19052834/is-it-possible-to-exclude-files-from-git-language-statistics
-data/ZhConversion.php linguist-vendored
+# Exclude external ruleset files from GitHub PL stats
+# ref: https://stackoverflow.com/questions/19052834/is-it-possible-to-exclude-files-from-git-language-statistics
+# And prevent auto CRLF conversion to avoid checksum mismatch
+data/ZhConversion.php linguist-vendored binary
+data/*.txt linguist-vendored binary
 data/cgroups/*.json linguist-vendored
 web/public/cgroups.json linguist-vendored
 benches/*.txt linguist-vendored
diff --git a/build.rs b/build.rs
index 131014c..f9fa6cb 100644
--- a/build.rs
+++ b/build.rs
@@ -353,7 +353,8 @@ fn read_and_validate_file(path: &str, sha256sum: &[u8; 32]) -> String {
     assert_eq!(
         &sha256(&content),
         sha256sum,
-        "Validating the checksum of zhconv"
+        "Validating the checksum of {}",
+        path.display()
     );
     content
 }
diff --git a/src/lib.rs b/src/lib.rs
index 329900d..2b5214c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,7 +3,7 @@
 //! with the leftmost-longest matching strategy and linear time complexity with respect to the
 //! length of input text and conversion rules. It ships with a bunch of conversion tables,
 //! extracted from [zhConversion.php](https://phabricator.wikimedia.org/source/mediawiki/browse/master/includes/languages/data/ZhConversion.php)
-//! which is maintained and used by MediaWiki and Chinese Wikipedia.
+//! (maintained by MediaWiki and Chinese Wikipedia) and [OpenCC](https://github.com/BYVoid/OpenCC/tree/master/data/dictionary).
 //!
 //! While built-in datasets work well for general case, the converter is never meant to be 100%
 //! accurate, especially for professional text. In Chinese Wikipedia, it is pretty common for
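
Two reviewer notes on the build.rs hunk above. First, the `binary` attribute added in .gitattributes is what keeps this checksum check working: without it, git's auto CRLF conversion could rewrite the data files on checkout, so the SHA-256 computed at build time would no longer match the pinned digest. Second, the hunk calls `path.display()`, which is a `Path` method, while the hunk's context line still shows the old `path: &str` signature; presumably the signature is changed to a path type elsewhere in the same commit. A minimal, self-contained sketch of the resulting pattern follows; the `sha2`-backed helper and the `&Path` signature are assumptions for illustration, not taken from the patch:

    use sha2::{Digest, Sha256};
    use std::path::Path;

    // Hypothetical stand-in for the crate's own sha256 helper,
    // assumed here to be backed by the `sha2` crate.
    fn sha256(data: &[u8]) -> [u8; 32] {
        Sha256::digest(data).into()
    }

    // The pattern the hunk improves: on a checksum mismatch, the panic
    // message names the offending file instead of a hardcoded "zhconv".
    fn read_and_validate_file(path: &Path, sha256sum: &[u8; 32]) -> String {
        let content = std::fs::read_to_string(path).expect("failed to read data file");
        assert_eq!(
            &sha256(content.as_bytes()),
            sha256sum,
            "Validating the checksum of {}",
            path.display()
        );
        content
    }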