diff --git a/.gitignore b/.gitignore index c607031..8d89e22 100644 --- a/.gitignore +++ b/.gitignore @@ -130,5 +130,6 @@ dmypy.json # Pyre type checker .pyre/ -testdata -.vscode \ No newline at end of file + +# vscode debug config +.vscode/ \ No newline at end of file diff --git a/src/lean_dojo/__init__.py b/src/lean_dojo/__init__.py index a1297d6..5024a5d 100644 --- a/src/lean_dojo/__init__.py +++ b/src/lean_dojo/__init__.py @@ -28,8 +28,3 @@ from .interaction.parse_goals import Declaration, Goal, parse_goals from .data_extraction.lean import get_latest_commit, LeanGitRepo, LeanFile, Theorem, Pos from .constants import __version__ - -if os.geteuid() == 0: - logger.warning( - "Running LeanDojo as the root user may cause unexpected issues. Proceed with caution." - ) diff --git a/src/lean_dojo/constants.py b/src/lean_dojo/constants.py index aa50261..fd3f74b 100644 --- a/src/lean_dojo/constants.py +++ b/src/lean_dojo/constants.py @@ -71,20 +71,28 @@ assert re.fullmatch(r"\d+g", TACTIC_MEMORY_LIMIT) -def check_git_version(min_version: Tuple[int, int, int]) -> Tuple[int, int, int]: +def check_git_version(min_version: Tuple[int, int, int]) -> None: """Check the version of Git installed on the system.""" - res = subprocess.run("git --version", shell=True, capture_output=True, check=True) - output = res.stdout.decode() - error = res.stderr.decode() - assert error == "", error - m = re.match(r"git version (?P[0-9.]+)", output) - version = tuple(int(_) for _ in m["version"].split(".")) - - version_str = ".".join(str(_) for _ in version) - min_version_str = ".".join(str(_) for _ in min_version) - assert ( - version >= min_version - ), f"Git version {version_str} is too old. Please upgrade to at least {min_version_str}." + try: + res = subprocess.run( + "git --version", shell=True, capture_output=True, check=True + ) + output = res.stdout.decode().strip() + error = res.stderr.decode() + assert error == "", error + match = re.search(r"git version (\d+\.\d+\.\d+)", output) + if not match: + raise ValueError("Could not parse Git version from the output.") + # Convert version number string to tuple of integers + version = tuple(int(_) for _ in match.group(1).split(".")) + + version_str = ".".join(str(_) for _ in version) + min_version_str = ".".join(str(_) for _ in min_version) + assert ( + version >= min_version + ), f"Git version {version_str} is too old. Please upgrade to at least {min_version_str}." + except subprocess.CalledProcessError as e: + raise Exception(f"Failed to run git command: {e}") check_git_version((2, 25, 0)) diff --git a/src/lean_dojo/data_extraction/cache.py b/src/lean_dojo/data_extraction/cache.py index 5517895..2148966 100644 --- a/src/lean_dojo/data_extraction/cache.py +++ b/src/lean_dojo/data_extraction/cache.py @@ -13,7 +13,6 @@ from ..utils import ( execute, url_exists, - get_repo_info, report_critical_failure, ) from ..constants import ( @@ -34,11 +33,6 @@ def _split_git_url(url: str) -> Tuple[str, str]: return user_name, repo_name -def _format_dirname(url: str, commit: str) -> str: - user_name, repo_name = _split_git_url(url) - return f"{user_name}-{repo_name}-{commit}" - - _CACHE_CORRPUTION_MSG = "The cache may have been corrputed!" @@ -59,16 +53,16 @@ def __post_init__(self): lock_path = self.cache_dir.with_suffix(".lock") object.__setattr__(self, "lock", FileLock(lock_path)) - def get(self, url: str, commit: str) -> Optional[Path]: + def get(self, rel_cache_dir: Path) -> Optional[Path]: """Get the path of a traced repo with URL ``url`` and commit hash ``commit``. Return None if no such repo can be found.""" - _, repo_name = _split_git_url(url) - dirname = _format_dirname(url, commit) + dirname = rel_cache_dir.parent dirpath = self.cache_dir / dirname + cache_path = self.cache_dir / rel_cache_dir with self.lock: if dirpath.exists(): - assert (dirpath / repo_name).exists() - return dirpath / repo_name + assert cache_path.exists() + return cache_path elif not DISABLE_REMOTE_CACHE: url = os.path.join(REMOTE_CACHE_URL, f"{dirname}.tar.gz") @@ -83,20 +77,23 @@ def get(self, url: str, commit: str) -> Optional[Path]: with tarfile.open(f"{dirpath}.tar.gz") as tar: tar.extractall(self.cache_dir) os.remove(f"{dirpath}.tar.gz") - assert (dirpath / repo_name).exists() + assert (cache_path).exists() - return dirpath / repo_name + return cache_path else: return None - def store(self, src: Path, cache_path: Union[Path, None]=None) -> Path: - """Store a traced repo at path ``src``. Return its path in the cache.""" - if cache_path is None: - url, commit = get_repo_info(src) - _, repo_name = _split_git_url(url) - cache_path = self.cache_dir / _format_dirname(url, commit) / repo_name - if not cache_path.exists(): + def store(self, src: Path, rel_cache_dir: Path) -> Path: + """Store a traced repo at path ``src``. Return its path in the cache. + + Args: + src (Path): Path to the repo. + rel_cache_name (Path): The relative path of the stored repo in the cache. + """ + dirpath = self.cache_dir / rel_cache_dir.parent + cache_path = self.cache_dir / rel_cache_dir + if not dirpath.exists(): with self.lock: with report_critical_failure(_CACHE_CORRPUTION_MSG): shutil.copytree(src, cache_path) diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 3f210e5..a79918e 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -18,19 +18,21 @@ from github.Repository import Repository from github.GithubException import GithubException from typing import List, Dict, Any, Generator, Union, Optional, Tuple, Iterator -from git import Repo, GitCommandError +from git import Repo, BadName +from ..constants import TMP_DIR import uuid - +import shutil +from urllib.parse import urlparse +from .cache import cache as repo_cache +from .cache import _split_git_url from ..utils import ( - execute, read_url, url_exists, get_repo_info, working_directory, - repo_type_of_url, + is_git_repo, ) -from ..constants import LEAN4_URL, TMP_DIR - +from ..constants import LEAN4_URL GITHUB_ACCESS_TOKEN = os.getenv("GITHUB_ACCESS_TOKEN", None) """GiHub personal access token is optional. @@ -47,49 +49,86 @@ ) GITHUB = Github() -# LEAN4_REPO = GITHUB.get_repo("leanprover/lean4") LEAN4_REPO = None """The GitHub Repo for Lean 4 itself.""" _URL_REGEX = re.compile(r"(?P.*?)/*") +_SSH_TO_HTTPS_REGEX = re.compile(r"^git@github\.com:(.+)/(.+)(?:\.git)?$") + +REPO_CACHE_PREFIX = "repos" + -def normalize_url(url: str, repo_type:str ='github') -> str: - if repo_type == 'local': - return os.path.abspath(url) # Convert to absolute path if local +def normalize_url(url: str, repo_type: str = "github") -> str: + if repo_type == "local": + return os.path.abspath(url) # Convert to absolute path if local return _URL_REGEX.fullmatch(url)["url"] # Remove trailing `/`. + +def repo_type_of_url(url: str) -> Union[str, None]: + """Get the type of the repository. + + Args: + url (str): The URL of the repository. + Returns: + str: The type of the repository. + """ + m = _SSH_TO_HTTPS_REGEX.match(url) + url = f"https://github.com/{m.group(1)}/{m.group(2)}" if m else url + parsed_url = urlparse(url) + if parsed_url.scheme in ["http", "https"]: + # case 1 - GitHub URL + if "github.com" in url: + if not url.startswith("https://"): + logger.warning(f"{url} should start with https://") + return + else: + return "github" + # case 2 - remote URL + elif url_exists(url): # not check whether it is a git URL + return "remote" + # case 3 - local path + elif is_git_repo(Path(parsed_url.path)): + return "local" + logger.warning(f"{url} is not a valid URL") + return None + + @cache -def url_to_repo(url: str, num_retries: int = 2, repo_type: str = 'github') -> Repo: +def url_to_repo( + url: str, + num_retries: int = 2, + repo_type: Union[str, None] = None, + tmp_dir: Union[Path] = None, +) -> Union[Repo, Repository]: """Convert a URL to a Repo object. - + Args: url (str): The URL of the repository. num_retries (int): Number of retries in case of failure. - repo_type (str): The type of the repository. Defaults to 'github'. + repo_type (Optional[str]): The type of the repository. Defaults to None. + tmp_dir (Optional[Path]): The temporary directory to clone the repo to. Defaults to None. Returns: Repo: A Git Repo object. """ url = normalize_url(url) backoff = 1 - temp_dir = os.path.join(TMP_DIR or '/tmp', str(uuid.uuid4())[:8]) - + tmp_dir = tmp_dir or os.path.join(TMP_DIR or "/tmp", str(uuid.uuid4())[:8]) + repo_type = repo_type or repo_type_of_url(url) + assert repo_type is not None, f"Invalid url {url}" while True: try: - if repo_type == 'github': + if repo_type == "github": return GITHUB.get_repo("/".join(url.split("/")[-2:])) - elif repo_type == 'local': - if not os.path.exists(url): - raise ValueError(f"Local path {url} does not exist") - try: - Repo(url) - except: - raise ValueError(f"Local path {url} is not a git repo") - shutil.copytree(url, f"{temp_dir}/{os.path.basename(url)}") - return Repo(f"{temp_dir}/{os.path.basename(url)}") - else: - return Repo.clone_from(url, f"{temp_dir}/{os.path.basename(url)}") + with working_directory(tmp_dir): + repo_name = os.path.basename(url) + if repo_type == "local": + assert is_git_repo(url), f"Local path {url} is not a git repo" + shutil.copytree(url, repo_name) + return Repo(repo_name) + else: + return Repo.clone_from(url, repo_name) except Exception as ex: if num_retries <= 0: raise ex @@ -98,11 +137,15 @@ def url_to_repo(url: str, num_retries: int = 2, repo_type: str = 'github') -> Re time.sleep(backoff) backoff *= 2 + @cache def get_latest_commit(url: str) -> str: """Get the hash of the latest commit of the Git repo at ``url``.""" repo = url_to_repo(url) - return repo.get_branch(repo.default_branch).commit.sha + if isinstance(repo, Repository): + return repo.get_branch(repo.default_branch).commit.sha + else: + return repo.head.commit.hexsha def cleanse_string(s: Union[str, Path]) -> str: @@ -110,7 +153,6 @@ def cleanse_string(s: Union[str, Path]) -> str: return str(s).replace("/", "_").replace(":", "_") -@cache def _to_commit_hash(repo: Union[Repository, Repo], label: str) -> str: """Convert a tag or branch to a commit hash.""" # GitHub repository @@ -122,16 +164,19 @@ def _to_commit_hash(repo: Union[Repository, Repo], label: str) -> str: raise ValueError(f"Invalid tag or branch: `{label}` for {repo.name}") # Local or remote Git repository elif isinstance(repo, Repo): - logger.debug(f"Querying the commit hash for {repo.working_dir} repository {label}") + logger.debug( + f"Querying the commit hash for {repo.working_dir} repository {label}" + ) try: # Resolve the label to a commit hash commit = repo.commit(label).hexsha - except GitCommandError as e: + except Exception as e: raise ValueError(f"Error converting ref to commit hash: {e}") else: raise TypeError("Unsupported repository type") return commit + @dataclass(eq=True, unsafe_hash=True) class Pos: """Position in source files. @@ -351,6 +396,11 @@ def __getitem__(self, key) -> str: _LEAN4_VERSION_REGEX = re.compile(r"leanprover/lean4:(?P.+?)") +def is_commit_hash(s: str): + """Check if a string is a valid commit hash.""" + return len(s) == 40 and _COMMIT_REGEX.fullmatch(s) + + def get_lean4_version_from_config(toolchain: str) -> str: """Return the required Lean version given a ``lean-toolchain`` config.""" m = _LEAN4_VERSION_REGEX.fullmatch(toolchain.strip()) @@ -451,34 +501,38 @@ def __post_init__(self) -> None: raise ValueError(f"{self.url} is not a valid URL") object.__setattr__(self, "repo_type", repo_type) object.__setattr__(self, "url", normalize_url(self.url, repo_type=repo_type)) - object.__setattr__(self, "repo", url_to_repo(self.url, repo_type=repo_type)) - if self.repo_type != 'github': # checkout to the commit for the `git.Repo` object - self.repo.git.checkout(self.commit) - - # Convert tags or branches to commit hashes - if not (len(self.commit) == 40 and _COMMIT_REGEX.fullmatch(self.commit)): - if (self.url, self.commit) in info_cache.tag2commit: - commit = info_cache.tag2commit[(self.url, self.commit)] - else: - commit = _to_commit_hash(self.repo, self.commit) - assert _COMMIT_REGEX.fullmatch(commit), f"Invalid commit hash: {commit}" - info_cache.tag2commit[(self.url, self.commit)] = commit - object.__setattr__(self, "commit", commit) + # set repo and commit + if repo_type == "github": + repo = url_to_repo(self.url, repo_type=repo_type) + # Convert tags or branches to commit hashes + if not is_commit_hash(self.commit): + if (self.url, self.commit) in info_cache.tag2commit: + commit = info_cache.tag2commit[(self.url, self.commit)] + else: + commit = _to_commit_hash(repo, self.commit) + assert is_commit_hash(commit), f"Invalid commit hash: {commit}" + info_cache.tag2commit[(self.url, commit)] = commit + object.__setattr__(self, "commit", commit) + else: + # get repo from cache + cache_repo_path = repo_cache.get( + REPO_CACHE_PREFIX / self.format_dirname / self.name + ) + # clone and store the repo if not in cache + if cache_repo_path is None: + cache_repo_path = self.add_to_cache() + repo = Repo(cache_repo_path) + object.__setattr__(self, "commit", _to_commit_hash(repo, self.commit)) + object.__setattr__(self, "repo", repo) # Determine the required Lean version. if (self.url, self.commit) in info_cache.lean_version: lean_version = info_cache.lean_version[(self.url, self.commit)] - elif self.is_lean4: - lean_version = self.commit + if self.is_lean4: + lean_version = "latest" # lean4 itself else: config = self.get_config("lean-toolchain") - toolchain = config["content"] - m = _LEAN4_VERSION_REGEX.fullmatch(toolchain.strip()) - if m is not None: - lean_version = m["version"] - else: - # lean_version_commit = get_lean4_commit_from_config(config) - lean_version = get_lean4_version_from_config(toolchain) + lean_version = get_lean4_version_from_config(config["content"]) if not is_supported_version(lean_version): logger.warning( f"{self} relies on an unsupported Lean version: {lean_version}" @@ -487,12 +541,10 @@ def __post_init__(self) -> None: object.__setattr__(self, "lean_version", lean_version) @classmethod - def from_path(cls, path: Union[str, Path]) -> "LeanGitRepo": + def from_path(cls, path: Union[Path, str]) -> "LeanGitRepo": """Construct a :class:`LeanGitRepo` object from the path to a local Git repo.""" - if isinstance(path, str): - path = Path(path) - url, commit = get_repo_info(path) - return cls(url, commit) + commit = Repo(path).head.commit.hexsha + return cls(str(path), commit) @property def name(self) -> str: @@ -504,30 +556,52 @@ def is_lean4(self) -> bool: @property def commit_url(self) -> str: - return os.path.join(self.url, f"tree/{self.commit}") + return f"{self.url}/tree/{self.commit}" + + @property + def format_dirname(self) -> str: + """Return the formatted cache directory name""" + if self.repo_type == "github": + user_name, repo_name = _split_git_url(self.url) + else: # f"gitpython-{repo_name}-{commit}" + user_name, repo_name = "gitpython", self.name + return Path(f"{user_name}-{repo_name}-{self.commit}") + + def add_to_cache(self) -> Path: + """Store the repo in the cache directory.""" + assert self.repo_type in [ + "local", + "remote", + ], f"Unsupported cache repo type: {self.repo_type}" + with working_directory() as tmp_dir: + repo = url_to_repo(self.url, repo_type=self.repo_type, tmp_dir=tmp_dir) + commit = _to_commit_hash(repo, self.commit) + repo.git.checkout(commit) + rel_cache_dir = REPO_CACHE_PREFIX / self.format_dirname / self.name + return repo_cache.store(repo.working_dir, rel_cache_dir) def show(self) -> None: """Show the repo in the default browser.""" webbrowser.open(self.commit_url) def exists(self) -> bool: - if self.repo_type == 'local': - return os.path.exists(self.url) + if self.repo_type == "local": + try: + self.repo.commit(self.commit) + return True + except BadName: + logger.warning( + f"Commit {self.commit} does not exist in this repository." + ) + return False return url_exists(self.commit_url) def clone_and_checkout(self) -> None: """Clone the repo to the current working directory and checkout a specific commit.""" logger.debug(f"Cloning {self}") - if self.repo_type == 'github' or self.repo_type == 'remote': - execute(f"git clone -n --recursive {self.url}", capture_output=True) - with working_directory(self.name): - execute( - f"git checkout {self.commit} && git submodule update --recursive", - capture_output=True, - ) - else: - shutil.copytree(self.url, self.name) - Repo(self.name).git.checkout(self.commit) + repo = Repo.clone_from(self.url, Path(self.name), no_checkout=True) + repo.git.checkout(self.commit) + repo.submodule_update(init=True, recursive=True) def get_dependencies( self, path: Union[str, Path, None] = None @@ -661,7 +735,7 @@ def _get_config_url(self, filename: str) -> str: def get_config(self, filename: str, num_retries: int = 2) -> Dict[str, Any]: """Return the repo's files.""" - if self.repo_type == 'github': + if self.repo_type == "github": config_url = self._get_config_url(filename) content = read_url(config_url, num_retries) else: diff --git a/src/lean_dojo/data_extraction/trace.py b/src/lean_dojo/data_extraction/trace.py index 963f01d..1fae85a 100644 --- a/src/lean_dojo/data_extraction/trace.py +++ b/src/lean_dojo/data_extraction/trace.py @@ -16,7 +16,7 @@ from subprocess import CalledProcessError from typing import Union, Optional, List, Generator -from .cache import cache, get_repo_info, _format_dirname +from .cache import cache from .lean import LeanGitRepo from ..constants import NUM_PROCS from .traced_data import TracedRepo @@ -204,7 +204,7 @@ def get_traced_repo_path(repo: LeanGitRepo, build_deps: bool = True) -> Path: Returns: Path: The path of the traced repo in the cache, e.g. :file:`/home/kaiyu/.cache/lean_dojo/leanprover-community-mathlib-2196ab363eb097c008d4497125e0dde23fb36db2` """ - path = cache.get(repo.url, repo.commit) + path = cache.get(repo.format_dirname / repo.name) if path is None: logger.info(f"Tracing {repo}") with working_directory() as tmp_dir: @@ -212,10 +212,9 @@ def get_traced_repo_path(repo: LeanGitRepo, build_deps: bool = True) -> Path: _trace(repo, build_deps) traced_repo = TracedRepo.from_traced_files(tmp_dir / repo.name, build_deps) traced_repo.save_to_disk() - # cache path: $HOME/.cache/lean_dojo/{_format_dirname}/{repo_name} - url, commit = get_repo_info(Path(repo.url)) - cache_path = cache.cache_dir / _format_dirname(url, commit) / repo.name - path = cache.store(tmp_dir / repo.name, cache_path) + src_dir = tmp_dir / repo.name + rel_cache_dir = Path(repo.format_dirname) / repo.name + path = cache.store(src_dir, rel_cache_dir) else: logger.debug("The traced repo is available in the cache.") return path diff --git a/src/lean_dojo/utils.py b/src/lean_dojo/utils.py index 63b19d9..cc3e211 100644 --- a/src/lean_dojo/utils.py +++ b/src/lean_dojo/utils.py @@ -5,7 +5,7 @@ import os import ray import time -import urllib +import urllib, urllib.request, urllib.error import typing import hashlib import tempfile @@ -144,9 +144,10 @@ def camel_case(s: str) -> str: """Convert the string ``s`` to camel case.""" return _CAMEL_CASE_REGEX.sub(" ", s).title().replace(" ", "") + def repo_type_of_url(url: str) -> str: """Get the type of the repository. - + Args: url (str): The URL of the repository. @@ -171,6 +172,7 @@ def repo_type_of_url(url: str) -> str: else: logger.warning(f"{url} is not a valid URL") + @cache def get_repo_info(path: Path) -> Tuple[str, str]: """Get the URL and commit hash of the Git repo at ``path``. @@ -181,7 +183,7 @@ def get_repo_info(path: Path) -> Tuple[str, str]: Returns: Tuple[str, str]: URL and (most recent) hash commit """ - url = str(path.absolute()) # use the absolute path + url = str(path.absolute()) # use the absolute path # Get the commit. commit_msg, _ = execute(f"git log -n 1", capture_output=True) m = re.search(r"(?<=^commit )[a-z0-9]+", commit_msg) @@ -215,7 +217,11 @@ def read_url(url: str, num_retries: int = 2) -> str: backoff = 1 while True: try: - with urllib.request.urlopen(url) as f: + request = urllib.request.Request(url) + gh_token = os.getenv("GITHUB_ACCESS_TOKEN") + if gh_token is not None: + request.add_header("Authorization", f"token {gh_token}") + with urllib.request.urlopen(request) as f: return f.read().decode() except Exception as ex: if num_retries <= 0: @@ -228,9 +234,13 @@ def read_url(url: str, num_retries: int = 2) -> str: @cache def url_exists(url: str) -> bool: - """Return True if the URL ``url`` exists.""" + """Return True if the URL ``url`` exists, using the GITHUB_ACCESS_TOKEN for authentication if provided.""" try: - with urllib.request.urlopen(url) as _: + request = urllib.request.Request(url) + gh_token = os.getenv("GITHUB_ACCESS_TOKEN") + if gh_token is not None: + request.add_header("Authorization", f"token {gh_token}") + with urllib.request.urlopen(request) as _: return True except urllib.error.HTTPError: return False diff --git a/tests/conftest.py b/tests/conftest.py index 0df03dc..d581c62 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,5 @@ import pytest -import os, shutil -from git import Repo + from lean_dojo import * @@ -8,6 +7,8 @@ AESOP_URL = "https://github.com/leanprover-community/aesop" MATHLIB4_URL = "https://github.com/leanprover-community/mathlib4" LEAN4_EXAMPLE_URL = "https://github.com/yangky11/lean4-example" +EXAMPLE_COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" +REMOTE_EXAMPLE_URL = "https://gitee.com/rexzong/lean4-example" URLS = [ BATTERIES_URL, AESOP_URL, @@ -15,35 +16,21 @@ LEAN4_EXAMPLE_URL, ] -EXAMPLE_COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" -REMOTE_EXAMPLE_URL = "https://gitee.com/rexzong/lean4-example" -LOCAL_TEST_PATH = f"{os.path.dirname(__file__)}/testdata" @pytest.fixture(scope="session") -def clean_clone_and_checkout(): - def _clean_clone_and_checkout(repo_url, local_path, label='main'): - if os.path.exists(local_path): - shutil.rmtree(local_path) - repo = Repo.clone_from(repo_url, local_path) - repo.git.checkout(label) - return repo - return _clean_clone_and_checkout +def remote_example_url(): + return REMOTE_EXAMPLE_URL -@pytest.fixture(scope="session") -def lean4_example_url(): - return LEAN4_EXAMPLE_URL @pytest.fixture(scope="session") def example_commit_hash(): return EXAMPLE_COMMIT_HASH -@pytest.fixture(scope="session") -def remote_example_url(): - return REMOTE_EXAMPLE_URL @pytest.fixture(scope="session") -def local_test_path(): - return LOCAL_TEST_PATH +def lean4_example_url(): + return LEAN4_EXAMPLE_URL + @pytest.fixture(scope="session") def monkeysession(): diff --git a/tests/data_extraction/test_cache.py b/tests/data_extraction/test_cache.py new file mode 100644 index 0000000..4741239 --- /dev/null +++ b/tests/data_extraction/test_cache.py @@ -0,0 +1,39 @@ +# test for cache manager +from git import Repo +from lean_dojo.utils import working_directory +from pathlib import Path +from lean_dojo.data_extraction.cache import cache + + +def test_repo_cache(lean4_example_url, remote_example_url, example_commit_hash): + # Note: The `git.Repo` requires the local repo to be cloned in a directory + # all cached repos are stored in CACHE_DIR/repos + prefix = "repos" + repo_name = "lean4-example" + + # test local repo cache + with working_directory() as tmp_dir: + repo = Repo.clone_from(lean4_example_url, repo_name) + repo.git.checkout(example_commit_hash) + local_dir = tmp_dir / repo_name + rel_cache_dir = ( + prefix / Path(f"gitpython-{repo_name}-{example_commit_hash}") / repo_name + ) + cache.store(local_dir, rel_cache_dir) + # get the cache + repo_cache_dir = cache.get(rel_cache_dir) + assert repo_cache_dir is not None + + # test remote repo cache + with working_directory() as tmp_dir: + repo = Repo.clone_from(remote_example_url, repo_name) + tmp_remote_dir = tmp_dir / repo_name + rel_cache_dir = ( + prefix + / Path(f"gitpython-{repo_name}-{repo.head.commit.hexsha}") + / repo_name + ) + cache.store(tmp_remote_dir, rel_cache_dir) + # get the cache + repo_cache = cache.get(rel_cache_dir) + assert repo_cache is not None diff --git a/tests/data_extraction/test_lean_repo.py b/tests/data_extraction/test_lean_repo.py new file mode 100644 index 0000000..d9763a7 --- /dev/null +++ b/tests/data_extraction/test_lean_repo.py @@ -0,0 +1,133 @@ +# test for the class `LeanGitRepo` +from lean_dojo import LeanGitRepo +from git import Repo +from github.Repository import Repository +from lean_dojo.utils import working_directory +from lean_dojo.data_extraction.lean import ( + _to_commit_hash, + repo_type_of_url, + url_to_repo, + get_latest_commit, + is_commit_hash, + GITHUB, + LEAN4_REPO, +) + + +def test_github_type(lean4_example_url, example_commit_hash): + repo_name = "lean4-example" + + ## get_latest_commit + gh_cm_hash = get_latest_commit(lean4_example_url) + assert is_commit_hash(gh_cm_hash) + + ## url_to_repo & repo_type_of_url + github_repo = url_to_repo(lean4_example_url) + assert repo_type_of_url(lean4_example_url) == "github" + assert repo_type_of_url("git@github.com:yangky11/lean4-example.git") == "github" + assert repo_type_of_url("git@github.com:yangky11/lean4-example") == "github" + assert isinstance(github_repo, Repository) + assert github_repo.name == repo_name + + ## commit hash + assert _to_commit_hash(github_repo, example_commit_hash) == example_commit_hash + ### test branch, assume this branch is not changing + assert ( + _to_commit_hash(github_repo, "paper") + == "8bf74cf67d1acf652a0c74baaa9dc3b9b9e4098c" + ) + ### test git tag + assert ( + _to_commit_hash(GITHUB.get_repo("leanprover/lean4"), "v4.9.1") + == "1b78cb4836cf626007bd38872956a6fab8910993" + ) + + ## LeanGitRepo + repo = LeanGitRepo(lean4_example_url, example_commit_hash) + assert repo.url == lean4_example_url + assert repo.repo_type == "github" + assert repo.commit == example_commit_hash + assert repo.exists() + assert repo.name == repo_name + assert repo.lean_version == "v4.7.0" + assert repo.commit_url == f"{lean4_example_url}/tree/{example_commit_hash}" + # cache name + assert isinstance(repo.repo, Repository) + assert str(repo.format_dirname) == f"yangky11-{repo_name}-{example_commit_hash}" + + +def test_remote_type(remote_example_url, example_commit_hash): + repo_name = "lean4-example" + + remote_repo = url_to_repo(remote_example_url) + assert repo_type_of_url(remote_example_url) == "remote" + assert isinstance(remote_repo, Repo) + re_cm_hash = get_latest_commit(remote_example_url) + assert re_cm_hash == get_latest_commit(str(remote_repo.working_dir)) + assert _to_commit_hash(remote_repo, example_commit_hash) == example_commit_hash + assert ( + _to_commit_hash(remote_repo, "origin/paper") + == "8bf74cf67d1acf652a0c74baaa9dc3b9b9e4098c" + ) + + ## LeanGitRepo + repo = LeanGitRepo(remote_example_url, example_commit_hash) + assert repo.url == remote_example_url + assert repo.repo_type == "remote" + assert repo.commit == example_commit_hash + assert repo.exists() + assert repo.name == repo_name + assert repo.lean_version == "v4.7.0" + assert repo.commit_url == f"{remote_example_url}/tree/{example_commit_hash}" + # cache name + assert isinstance(repo.repo, Repo) + assert str(repo.format_dirname) == f"gitpython-{repo_name}-{example_commit_hash}" + + +def test_local_type(lean4_example_url, example_commit_hash): + repo_name = "lean4-example" + gh_cm_hash = get_latest_commit(lean4_example_url) + + with working_directory() as tmp_dir: + # git repo placed in `tmp_dir / repo_name` + Repo.clone_from(lean4_example_url, repo_name) + + ## get_latest_commit + local_url = str((tmp_dir / repo_name).absolute()) + assert get_latest_commit(local_url) == gh_cm_hash + + ## url_to_repo & repo_type_of_url + local_repo = url_to_repo(local_url, repo_type="local") + assert repo_type_of_url(local_url) == "local" + assert isinstance(local_repo, Repo) + assert ( + local_repo.working_dir != local_url + ), "The working directory should not be the same as the original repo" + + ## commit hash + repo = Repo(local_url) + repo.git.checkout(example_commit_hash) + repo.create_tag("v0.1.0") # create a tag for the example commit hash + repo.git.checkout("main") # switch back to main branch + assert _to_commit_hash(repo, example_commit_hash) == example_commit_hash + assert ( + _to_commit_hash(repo, "origin/paper") + == "8bf74cf67d1acf652a0c74baaa9dc3b9b9e4098c" + ) + assert _to_commit_hash(repo, "v0.1.0") == example_commit_hash + + ## LeanGitRepo + repo = LeanGitRepo(local_url, example_commit_hash) + repo2 = LeanGitRepo.from_path(local_url) # test from_path + assert repo.url == local_url == repo2.url + assert repo.repo_type == "local" == repo2.repo_type + assert repo.commit == example_commit_hash and repo2.commit == gh_cm_hash + assert repo.exists() and repo2.exists() + assert repo.name == repo_name == repo2.name + assert repo.lean_version == "v4.7.0" + # cache name + assert isinstance(repo.repo, Repo) and isinstance(repo2.repo, Repo) + assert ( + str(repo.format_dirname) == f"gitpython-{repo_name}-{example_commit_hash}" + ) + assert str(repo2.format_dirname) == f"gitpython-{repo_name}-{gh_cm_hash}" diff --git a/tests/data_extraction/test_repo_type.py b/tests/data_extraction/test_repo_type.py deleted file mode 100644 index c566856..0000000 --- a/tests/data_extraction/test_repo_type.py +++ /dev/null @@ -1,131 +0,0 @@ -import pytest -from lean_dojo import LeanGitRepo -import os -from lean_dojo.data_extraction.lean import url_to_repo, _to_commit_hash -from github import Repository -from git import Repo -from lean_dojo.utils import repo_type_of_url, get_repo_info -from pathlib import Path - -def test_repo_types(clean_clone_and_checkout, local_test_path, lean4_example_url, remote_example_url, example_commit_hash): - # init: local repo - local_repo_path = os.path.join(local_test_path, 'lean4-example') - clean_clone_and_checkout(lean4_example_url, local_repo_path, 'main') - url, _ = get_repo_info(Path(local_repo_path)) - assert url == local_repo_path - - # test repo type of url - assert repo_type_of_url(lean4_example_url) == 'github' - assert repo_type_of_url(remote_example_url) == 'remote' - assert repo_type_of_url(local_repo_path) == 'local' - - # test url to repo - ## test github url - github_repo = url_to_repo(lean4_example_url, num_retries=2) - assert isinstance(github_repo, Repository.Repository) - assert github_repo.full_name == 'yangky11/lean4-example' - - ## test remote url - remote_repo = url_to_repo(remote_example_url, repo_type='remote', num_retries=2) - assert isinstance(remote_repo, Repo) - assert remote_repo.remotes[0].url == remote_example_url - - ## test local path - local_repo = url_to_repo(local_repo_path, repo_type='local') - assert isinstance(local_repo, Repo) - assert local_repo.remotes[0].url == lean4_example_url - assert ( - local_repo.working_dir != local_repo_path - ), "working_dir should not be the same as local_repo_path to avoid changing the original repo" - - # test commit hash - ## test github url - commit_hash = _to_commit_hash(github_repo, 'main') - assert len(commit_hash) == 40 - commit_hash = _to_commit_hash(github_repo, example_commit_hash) - assert commit_hash == example_commit_hash - - ## test remote url - commit_hash = _to_commit_hash(remote_repo, 'main') - assert len(commit_hash) == 40 - commit_hash = _to_commit_hash(remote_repo, example_commit_hash) - assert commit_hash == example_commit_hash - - ## test local path - commit_hash = _to_commit_hash(local_repo, 'main') - assert len(commit_hash) == 40 - commit_hash = _to_commit_hash(local_repo, example_commit_hash) - assert commit_hash == example_commit_hash - - -def test_local_with_branch(clean_clone_and_checkout, lean4_example_url, local_test_path): - # Initialize GitHub repo - github_repo = LeanGitRepo(url=lean4_example_url, commit="main") - - # Initialize local repo - local_repo_path = os.path.join(local_test_path, 'lean4-example') - clean_clone_and_checkout(lean4_example_url, local_repo_path, 'main') - local_repo = LeanGitRepo(url=local_repo_path, commit="main") - from_path_repo = LeanGitRepo.from_path(local_repo_path) - - # Check if commit hashes match - assert github_repo.commit == local_repo.commit == from_path_repo.commit - assert github_repo.lean_version == local_repo.lean_version == from_path_repo.lean_version - - # check the repo type - assert github_repo.repo_type == 'github' - assert local_repo.repo_type == 'local' - assert from_path_repo.repo_type == 'local' - - # check repo name - assert github_repo.name == 'lean4-example' == local_repo.name == from_path_repo.name - - # test get_config - assert github_repo.get_config('lean-toolchain') == {'content': 'leanprover/lean4:v4.9.0-rc3\n'} - assert local_repo.get_config('lean-toolchain') == {'content': 'leanprover/lean4:v4.9.0-rc3\n'} - assert from_path_repo.get_config('lean-toolchain') == {'content': 'leanprover/lean4:v4.9.0-rc3\n'} - -def test_local_with_commit(clean_clone_and_checkout, lean4_example_url, local_test_path, example_commit_hash): - # Initialize GitHub repo - github_repo = LeanGitRepo(url=lean4_example_url, commit=example_commit_hash) - - # Initialize local repo - local_repo_path = os.path.join(local_test_path, 'lean4-example') - clean_clone_and_checkout(lean4_example_url, local_repo_path, 'main') # use main branch - local_repo = LeanGitRepo(url=local_repo_path, commit=example_commit_hash) # checkout to commit hash - - # Check if commit hashes match - assert github_repo.commit == local_repo.commit - assert github_repo.lean_version == local_repo.lean_version - - # check the repo type - assert github_repo.repo_type == 'github' - assert local_repo.repo_type == 'local' - - # check repo name - assert github_repo.name == 'lean4-example' == local_repo.name - - # test get_config - assert github_repo.get_config('lean-toolchain') == {'content': 'leanprover/lean4:v4.9.0-rc3\n'} - assert local_repo.get_config('lean-toolchain') == {'content': 'leanprover/lean4:v4.9.0-rc3\n'} - -def test_remote_url(lean4_example_url, remote_example_url, example_commit_hash): - # Initialize GitHub repo - github_repo = LeanGitRepo(url=lean4_example_url, commit=example_commit_hash) - # Initialize Gitee repo - _ = LeanGitRepo(url=remote_example_url, commit="main") # get commit by branch - gitee_repo = LeanGitRepo(url=remote_example_url, commit=example_commit_hash) - - # Check if commit hashes match - assert github_repo.commit == gitee_repo.commit - assert github_repo.lean_version == gitee_repo.lean_version - - # check the repo type - assert gitee_repo.repo_type == 'remote' - - # check repo name - assert gitee_repo.name == 'lean4-example' == github_repo.name - - # test get_config - assert gitee_repo.get_config('lean-toolchain') == {'content': 'leanprover/lean4:v4.9.0-rc3\n'} - assert github_repo.get_config('lean-toolchain') == {'content': 'leanprover/lean4:v4.9.0-rc3\n'} diff --git a/tests/data_extraction/test_trace.py b/tests/data_extraction/test_trace.py index 541060c..24df62f 100644 --- a/tests/data_extraction/test_trace.py +++ b/tests/data_extraction/test_trace.py @@ -1,48 +1,48 @@ -import pytest +from pathlib import Path from lean_dojo import * +from lean_dojo.data_extraction.cache import cache +from lean_dojo.utils import working_directory +from lean_dojo.data_extraction.lean import url_to_repo from git import Repo -import os, shutil -from pathlib import Path -# Avoid using remote cache -os.environ['DISABLE_REMOTE_CACHE'] = 'true' -@pytest.fixture(scope="session") -def local_trace_dir(local_test_path): - return os.path.join(local_test_path, 'lean4-example-traced') +def test_github_trace(lean4_example_url): + # github + github_repo = LeanGitRepo(lean4_example_url, "main") + assert github_repo.repo_type == "github" + trace_repo = trace(github_repo) + path = cache.get(github_repo.format_dirname / github_repo.name) + assert path is not None + + +def test_remote_trace(remote_example_url): + # remote + remote_repo = LeanGitRepo(remote_example_url, "main") + assert remote_repo.repo_type == "remote" + trace_repo = trace(remote_repo) + path = cache.get(remote_repo.format_dirname / remote_repo.name) + assert path is not None + + +def test_local_trace(lean4_example_url): + # local + with working_directory() as tmp_dir: + # git repo placed in `tmp_dir / repo_name` + Repo.clone_from(lean4_example_url, "lean4-example") + local_dir = str((tmp_dir / "lean4-example")) + local_url = str((tmp_dir / "lean4-example").absolute()) + local_repo = LeanGitRepo(local_dir, "main") + assert local_repo.url == local_url + assert local_repo.repo_type == "local" + trace_repo = trace(local_repo) + path = cache.get(local_repo.format_dirname / local_repo.name) + assert path is not None -def test_remote_trace(remote_example_url, local_trace_dir): - remote_repo = LeanGitRepo(url=remote_example_url, commit="main") - assert remote_repo.repo_type == 'remote' - if os.path.exists(local_trace_dir): - shutil.rmtree(local_trace_dir) - traced_repo = trace(remote_repo, local_trace_dir) - traced_repo.check_sanity() - assert traced_repo.repo.repo_type == 'local' - -def test_local_trace(clean_clone_and_checkout, lean4_example_url, local_test_path, local_trace_dir): - local_repo_path = os.path.join(local_test_path, 'lean4-example') - clean_clone_and_checkout(lean4_example_url, local_repo_path) - local_repo = LeanGitRepo(url=local_repo_path, commit="main") - assert local_repo.repo_type == 'local' - if os.path.exists(local_trace_dir): - shutil.rmtree(local_trace_dir) - traced_repo = trace(local_repo, local_trace_dir) - traced_repo.check_sanity() - assert traced_repo.repo.repo_type == 'local' - -def test_github_trace(lean4_example_url, local_trace_dir): - remote_repo = LeanGitRepo(url=lean4_example_url, commit="main") - assert remote_repo.repo_type == 'github' - if os.path.exists(local_trace_dir): - shutil.rmtree(local_trace_dir) - traced_repo = trace(remote_repo, local_trace_dir) - traced_repo.check_sanity() - assert traced_repo.repo.repo_type == 'local' def test_trace(traced_repo): traced_repo.check_sanity() + def test_get_traced_repo_path(mathlib4_repo): path = get_traced_repo_path(mathlib4_repo) assert isinstance(path, Path) and path.exists() diff --git a/tests/interaction/test_interaction.py b/tests/interaction/test_interaction.py new file mode 100644 index 0000000..3eff4d9 --- /dev/null +++ b/tests/interaction/test_interaction.py @@ -0,0 +1,64 @@ +from lean_dojo import LeanGitRepo, Dojo, ProofFinished, ProofGivenUp, Theorem +from lean_dojo.utils import working_directory +from git import Repo +import os + +# Avoid using remote cache +os.environ["DISABLE_REMOTE_CACHE"] = "true" + + +def test_github_interact(lean4_example_url): + repo = LeanGitRepo(url=lean4_example_url, commit="main") + assert repo.repo_type == "github" + theorem = Theorem(repo, "Lean4Example.lean", "hello_world") + # initial state + dojo, state_0 = Dojo(theorem).__enter__() + assert state_0.pp == "a b c : Nat\n⊢ a + b + c = a + c + b" + # state after running a tactic + state_1 = dojo.run_tac(state_0, "rw [add_assoc]") + assert state_1.pp == "a b c : Nat\n⊢ a + (b + c) = a + c + b" + # state after running another a sorry tactic + assert dojo.run_tac(state_1, "sorry") == ProofGivenUp() + # finish proof + final_state = dojo.run_tac(state_1, "rw [add_comm b, ←add_assoc]") + assert isinstance(final_state, ProofFinished) + + +def test_remote_interact(remote_example_url): + repo = LeanGitRepo(url=remote_example_url, commit="main") + assert repo.repo_type == "remote" + theorem = Theorem(repo, "Lean4Example.lean", "hello_world") + # initial state + dojo, state_0 = Dojo(theorem).__enter__() + assert state_0.pp == "a b c : Nat\n⊢ a + b + c = a + c + b" + # state after running a tactic + state_1 = dojo.run_tac(state_0, "rw [add_assoc]") + assert state_1.pp == "a b c : Nat\n⊢ a + (b + c) = a + c + b" + # state after running another a sorry tactic + assert dojo.run_tac(state_1, "sorry") == ProofGivenUp() + # finish proof + final_state = dojo.run_tac(state_1, "rw [add_comm b, ←add_assoc]") + assert isinstance(final_state, ProofFinished) + + +def test_local_interact(lean4_example_url): + # Clone the GitHub repository to the local path + with working_directory() as tmp_dir: + # git repo placed in `tmp_dir / repo_name` + Repo.clone_from(lean4_example_url, "lean4-example") + + local_dir = str((tmp_dir / "lean4-example")) + repo = LeanGitRepo(local_dir, commit="main") + assert repo.repo_type == "local" + theorem = Theorem(repo, "Lean4Example.lean", "hello_world") + # initial state + dojo, state_0 = Dojo(theorem).__enter__() + assert state_0.pp == "a b c : Nat\n⊢ a + b + c = a + c + b" + # state after running a tactic + state_1 = dojo.run_tac(state_0, "rw [add_assoc]") + assert state_1.pp == "a b c : Nat\n⊢ a + (b + c) = a + c + b" + # state after running another a sorry tactic + assert dojo.run_tac(state_1, "sorry") == ProofGivenUp() + # finish proof + final_state = dojo.run_tac(state_1, "rw [add_comm b, ←add_assoc]") + assert isinstance(final_state, ProofFinished) diff --git a/tests/interaction/test_nongithub_repo.py b/tests/interaction/test_nongithub_repo.py deleted file mode 100644 index 76078df..0000000 --- a/tests/interaction/test_nongithub_repo.py +++ /dev/null @@ -1,42 +0,0 @@ -import pytest -from lean_dojo import * -from git import Repo -import os - -# Avoid using remote cache -os.environ['DISABLE_REMOTE_CACHE'] = 'true' - -def test_remote_interact(remote_example_url): - repo = LeanGitRepo(url=remote_example_url, commit="main") - assert repo.repo_type == 'remote' - theorem = Theorem(repo, "Lean4Example.lean", "hello_world") - # initial state - dojo, state_0 = Dojo(theorem).__enter__() - assert state_0.pp == 'a b c : Nat\n⊢ a + b + c = a + c + b' - # state after running a tactic - state_1 = dojo.run_tac(state_0, "rw [add_assoc]") - assert state_1.pp == 'a b c : Nat\n⊢ a + (b + c) = a + c + b' - # state after running another a sorry tactic - assert dojo.run_tac(state_1, "sorry") == ProofGivenUp() - # finish proof - final_state = dojo.run_tac(state_1, "rw [add_comm b, ←add_assoc]") - assert isinstance(final_state, ProofFinished) - -def test_local_interact(clean_clone_and_checkout, lean4_example_url, local_test_path): - # Clone the GitHub repository to the local path - local_repo_path = os.path.join(local_test_path, 'lean4-example') - clean_clone_and_checkout(lean4_example_url, local_repo_path) - repo = LeanGitRepo(url=local_repo_path, commit="main") - assert repo.repo_type == 'local' - theorem = Theorem(repo, "Lean4Example.lean", "hello_world") - # initial state - dojo, state_0 = Dojo(theorem).__enter__() - assert state_0.pp == 'a b c : Nat\n⊢ a + b + c = a + c + b' - # state after running a tactic - state_1 = dojo.run_tac(state_0, "rw [add_assoc]") - assert state_1.pp == 'a b c : Nat\n⊢ a + (b + c) = a + c + b' - # state after running another a sorry tactic - assert dojo.run_tac(state_1, "sorry") == ProofGivenUp() - # finish proof - final_state = dojo.run_tac(state_1, "rw [add_comm b, ←add_assoc]") - assert isinstance(final_state, ProofFinished)