Skip to content

Commit

Permalink
PSCE-408 refactor: replaces regex with urllib for repo URL parsing (#215
Browse files Browse the repository at this point in the history
)

* refactor: replaces regex with urllib for repo URL parsing

Using a dedicated library for url parsing is generally less error
prone and easier to maintain

Signed-off-by: Jennifer Power <[email protected]>

* fix: imports annotations from __future__

Resolves type errors in python 3.8

Signed-off-by: Jennifer Power <[email protected]>

* refactor: use assignment in expression in match_url

Signed-off-by: Jennifer Power <[email protected]>

---------

Signed-off-by: Jennifer Power <[email protected]>
  • Loading branch information
jpower432 committed May 7, 2024
1 parent 10421a3 commit 87242b9
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 16 deletions.
2 changes: 1 addition & 1 deletion tests/trestlebot/test_gitlab.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def test_parse_repository_with_incorrect_name() -> None:
gl = GitLab("fake")
with pytest.raises(
GitProviderException,
match="https://notgitlab.com/owner/repo.git is an invalid repo URL",
match="https://notgitlab.com/owner/repo.git is an invalid Gitlab repo URL",
):
gl.parse_repository("https://notgitlab.com/owner/repo.git")

Expand Down
19 changes: 15 additions & 4 deletions trestlebot/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

"""GitHub related functions for the Trestle Bot."""

from __future__ import annotations

import os
import re
from typing import Optional, Tuple
Expand Down Expand Up @@ -31,7 +33,15 @@ def __init__(self, access_token: str):
session.login(token=access_token)

self._session = session
self.pattern = r"^(?:https?://)?github\.com/([^/]+)/([^/.]+)"

# For repo URL input validation
pattern = r"^(?:https?://)?github\.com/([^/]+)/([^/.]+)"
self._pattern = re.compile(pattern)

@property
def provider_pattern(self) -> re.Pattern[str]:
"""Regex pattern to validate repository URLs"""
return self._pattern

def parse_repository(self, repo_url: str) -> Tuple[str, str]:
"""
Expand All @@ -43,11 +53,12 @@ def parse_repository(self, repo_url: str) -> Tuple[str, str]:
Returns:
Owner and repo name in a tuple, respectively
"""

match = re.match(self.pattern, repo_url)
match: Optional[re.Match[str]]
stripped_url: str
match, stripped_url = self.match_url(repo_url)

if not match:
raise GitProviderException(f"{repo_url} is an invalid GitHub repo URL")
raise GitProviderException(f"{stripped_url} is an invalid GitHub repo URL")

owner = match.group(1)
repo = match.group(2)
Expand Down
28 changes: 18 additions & 10 deletions trestlebot/gitlab.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@

"""GitLab related functions for the Trestle Bot."""

from __future__ import annotations

import os
import re
import time
from typing import Tuple
from typing import Optional, Tuple
from urllib.parse import ParseResult, urlparse

import gitlab

Expand All @@ -21,10 +24,16 @@ def __init__(self, api_token: str, server_url: str = "https://gitlab.com"):

self._gitlab_client = gitlab.Gitlab(server_url, private_token=api_token)

stripped_url = re.sub(r"^(https?://)?", "", server_url)
self.pattern = r"^(?:https?://)?{0}(/.+)/([^/.]+)(\.git)?$".format(
re.escape(stripped_url)
)
# For repo URL input validation
parsed_url: ParseResult = urlparse(server_url)
stripped_url = f"{parsed_url.netloc}{parsed_url.path}"
pattern = rf"^(?:https?://)?{re.escape(stripped_url)}(/.+)/([^/.]+)(\.git)?$"
self._pattern = re.compile(pattern)

@property
def provider_pattern(self) -> re.Pattern[str]:
"""Regex pattern to validate repository URLs"""
return self._pattern

def parse_repository(self, repo_url: str) -> Tuple[str, str]:
"""
Expand All @@ -37,13 +46,12 @@ def parse_repository(self, repo_url: str) -> Tuple[str, str]:
Owner and project name in a tuple, respectively
"""

# Strip out any basic auth
stripped_url = re.sub(r"https?://.*?@", "https://", repo_url)

match = re.match(self.pattern, stripped_url)
match: Optional[re.Match[str]]
stripped_url: str
match, stripped_url = self.match_url(repo_url)

if not match:
raise GitProviderException(f"{stripped_url} is an invalid repo URL")
raise GitProviderException(f"{stripped_url} is an invalid Gitlab repo URL")

owner = match.group(1)[1:] # Removing the leading slash
repo = match.group(2)
Expand Down
23 changes: 22 additions & 1 deletion trestlebot/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@

"""Base Git Provider class for the Trestle Bot."""

from __future__ import annotations

import re
from abc import ABC, abstractmethod
from typing import Tuple
from typing import Optional, Tuple
from urllib.parse import ParseResult, urlparse


class GitProviderException(Exception):
Expand All @@ -17,6 +21,23 @@ class GitProvider(ABC):
Abstract base class for Git provider types
"""

@property
@abstractmethod
def provider_pattern(self) -> re.Pattern[str]:
"""Regex pattern to validate repository URLs"""

def match_url(self, repo_url: str) -> Tuple[Optional[re.Match[str]], str]:
"""Match a repository URL with the pattern"""
parsed_url: ParseResult = urlparse(repo_url)

path = parsed_url.path
stripped_url = path
if host := parsed_url.hostname:
stripped_url = f"{host}{path}"
if scheme := parsed_url.scheme:
stripped_url = f"{scheme}://{stripped_url}"
return self.provider_pattern.match(stripped_url), stripped_url

@abstractmethod
def parse_repository(self, repository_url: str) -> Tuple[str, str]:
"""Parse repository information into namespace and repo, respectively"""
Expand Down

0 comments on commit 87242b9

Please sign in to comment.