From 7e3635e66670d5ec4bc1fbfe439c59e8ba5269d6 Mon Sep 17 00:00:00 2001 From: Gregory James Comer Date: Fri, 3 Oct 2025 15:24:31 -0700 Subject: [PATCH] Revert "[Windows] Fix build issues using Clang-CL on Windows, add CI (#121)" This reverts commit 4ed91cc545e9ed7098e53747656eb7eff24eb305. --- .github/workflows/pull.yml | 56 --------------------------- .gitignore | 4 -- CMakeLists.txt | 6 +-- include/pytorch/tokenizers/compiler.h | 20 ---------- include/pytorch/tokenizers/tiktoken.h | 1 - setup.py | 9 ++--- src/hf_tokenizer.cpp | 4 +- test/test_tiktoken.cpp | 8 ---- 8 files changed, 8 insertions(+), 100 deletions(-) delete mode 100644 include/pytorch/tokenizers/compiler.h diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index dee3f7e..1bf6e29 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -34,59 +34,3 @@ jobs: # Run tests pytest - - unittest-windows: - uses: pytorch/test-infra/.github/workflows/windows_job.yml@main - with: - runner: windows.4xlarge - submodules: 'recursive' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - script: | - conda init powershell - powershell -Command "& { - Set-PSDebug -Trace 1 - \$ErrorActionPreference = 'Stop' - \$PSNativeCommandUseErrorActionPreference = \$true - - # Create a symbolic link to work around path length limitations. This gives a much shorter - # path, as the default checkout directory is deeply nested. - \$workingDir = \$PWD.Path - cd \$Env:GITHUB_WORKSPACE - New-Item -ItemType SymbolicLink -Path tk -Value \$workingDir - cd tk - - # Run C++ unit tests - cmake -DCMAKE_BUILD_TYPE=Debug test -Bbuild/test -T ClangCL - cmake --build build/test -j9 --config Debug - if (\$LASTEXITCODE -ne 0) { - Write-Host "Build was not successful. Exit code: \$LASTEXITCODE." - exit \$LASTEXITCODE - } - - Push-Location build/test - ctest - if (\$LASTEXITCODE -ne 0) { - Write-Host "Unit tests were not successful. Exit code: \$LASTEXITCODE." - exit \$LASTEXITCODE - } - Pop-Location - - conda create --yes --quiet -n tokenizers python=3.12 - conda activate tokenizers - - # Install tokenizers - pip install . -v - if (\$LASTEXITCODE -ne 0) { - Write-Host "Python installation was unsuccessful. Exit code: \$LASTEXITCODE." - exit \$LASTEXITCODE - } - pip install pytest blobfile transformers>=4.53.1 - - # Run python tests - pytest - if (\$LASTEXITCODE -ne 0) { - Write-Host "Python tests were not successful. Exit code: \$LASTEXITCODE." - Start-Sleep -Seconds 600 # Debug - keep alive to give time to SSH - exit \$LASTEXITCODE - } - }" diff --git a/.gitignore b/.gitignore index c45e3c0..700cb52 100644 --- a/.gitignore +++ b/.gitignore @@ -34,7 +34,3 @@ pip-out/ *~ .~lock.* *.idea - -*.so -*.dylib -*.pyd diff --git a/CMakeLists.txt b/CMakeLists.txt index 5fec1ea..2794d00 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,9 +28,7 @@ include(CMakePackageConfigHelpers) include(Utils.cmake) # Ignore weak attribute warning -if(NOT MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes") -endif() +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes") set(ABSL_ENABLE_INSTALL ON) set(ABSL_PROPAGATE_CXX_STD ON) @@ -51,7 +49,7 @@ endif() add_subdirectory( ${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece - ${CMAKE_CURRENT_BINARY_DIR}/sp-build + ${CMAKE_CURRENT_BINARY_DIR}/sentencepiece-build EXCLUDE_FROM_ALL ) diff --git a/include/pytorch/tokenizers/compiler.h b/include/pytorch/tokenizers/compiler.h deleted file mode 100644 index 09d1b95..0000000 --- a/include/pytorch/tokenizers/compiler.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - - #pragma once - -/** - * @file - * Compiler and platform-specific declarations. - */ - -#ifdef _WIN32 -#include -// ssize_t isn't available on Windows. Alias it to the Windows SSIZE_T value. -typedef SSIZE_T ssize_t; -#endif diff --git a/include/pytorch/tokenizers/tiktoken.h b/include/pytorch/tokenizers/tiktoken.h index 833f6ba..6cf9edc 100644 --- a/include/pytorch/tokenizers/tiktoken.h +++ b/include/pytorch/tokenizers/tiktoken.h @@ -19,7 +19,6 @@ // Local #include -#include #include #include #include diff --git a/setup.py b/setup.py index 63ced6c..008ff6a 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,9 @@ class CMakeBuild(build_ext): def build_extension(self, ext): # noqa C901 extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) + # Ensure the extension goes into the pytorch_tokenizers package directory + extdir = os.path.join(extdir, "pytorch_tokenizers") + # Required for auto-detection & inclusion of auxiliary "native" libs if not extdir.endswith(os.path.sep): extdir += os.path.sep @@ -52,10 +55,6 @@ def build_extension(self, ext): # noqa C901 ] build_args = ["--target", "pytorch_tokenizers_cpp"] - # Use Clang for Windows builds. - if sys.platform == "win32": - cmake_args += ["-T ClangCL"] - # Adding CMake arguments set as environment variable # (needed e.g. to build for ARM OSX on conda-forge) if "CMAKE_ARGS" in os.environ: @@ -133,7 +132,7 @@ def build_extension(self, ext): # noqa C901 long_description_content_type="text/markdown", url="https://github.com/meta-pytorch/tokenizers", packages=find_packages(), - ext_modules=[CMakeExtension("pytorch_tokenizers.pytorch_tokenizers_cpp")], + ext_modules=[CMakeExtension("pytorch_tokenizers_cpp")], cmdclass={"build_ext": CMakeBuild}, zip_safe=False, python_requires=">=3.10", diff --git a/src/hf_tokenizer.cpp b/src/hf_tokenizer.cpp index 03d7816..5c1bace 100644 --- a/src/hf_tokenizer.cpp +++ b/src/hf_tokenizer.cpp @@ -34,14 +34,14 @@ Error HFTokenizer::load(const std::string& path) { std::string model_config_json = ""; if (fs::is_directory(path)) { const fs::path root(path); - model_json = (root / "tokenizer.json").string(); + model_json = root / "tokenizer.json"; if (!fs::exists(model_json)) { TK_LOG(Info, "no tokenizer.json found in %s", path.c_str()); return Error::LoadFailure; } const auto model_config_json_path = root / "tokenizer_config.json"; if (fs::exists(model_config_json_path)) { - model_config_json = model_config_json_path.string(); + model_config_json = model_config_json_path; } } diff --git a/test/test_tiktoken.cpp b/test/test_tiktoken.cpp index 2f5dc76..5c7ce8b 100644 --- a/test/test_tiktoken.cpp +++ b/test/test_tiktoken.cpp @@ -124,11 +124,7 @@ TEST_F(TiktokenTest, ConstructionWithInvalidBOSIndex) { std::vector{"<|end_of_text|>"}), 1, 0), -#if !GTEST_OS_WINDOWS ::testing::KilledBySignal(SIGABRT), -#else - [](int exit_code) { return exit_code != 0; }, -#endif ""); #endif } @@ -143,11 +139,7 @@ TEST_F(TiktokenTest, ConstructionWithInvalidEOSIndex) { std::vector{"<|begin_of_text|>"}), 0, 1), -#if !GTEST_OS_WINDOWS ::testing::KilledBySignal(SIGABRT), -#else - [](int exit_code) { return exit_code != 0; }, -#endif ""); #endif }