Skip to content

Commit

Permalink
Add metadata support for GitHub packages
Browse files Browse the repository at this point in the history
- The GitHub support makes use of the GitHub GraphQL API
- Before fetching metadata for a GitHub package, add your GitHub API token to the `.env` file. Example: `GH_TOKEN=your-github-api-key`

Signed-off-by: Keshav Priyadarshi <[email protected]>
  • Loading branch information
keshav-space committed Mar 25, 2024
1 parent 5f92d78 commit 76ba556
Show file tree
Hide file tree
Showing 3 changed files with 240 additions and 62 deletions.
72 changes: 13 additions & 59 deletions src/fetchcode/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,15 @@
from urllib.parse import urljoin

import htmllistparse
import requests
from packageurl import PackageURL
from packageurl.contrib.route import NoRouteAvailable
from packageurl.contrib.route import Router

from fetchcode.ipkg_release_info import IPKG_RELEASES
from fetchcode.package_util import GITHUB_SOURCE_BY_PACKAGE
from fetchcode.package_util import IPKG_RELEASES
from fetchcode.package_util import GitHubSource
from fetchcode.packagedcode_models import Package
from fetchcode.utils import get_response

router = Router()

Expand All @@ -44,17 +46,6 @@ def info(url):
return


def get_response(url):
    """
    Return the decoded JSON body of a GET request to the `url` string.

    Raise an Exception when the response status code is not 200.
    """
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch: {url}")

    return response.json()


def get_pypi_bugtracker_url(project_urls):
bug_tracking_url = project_urls.get("Tracker")
if not (bug_tracking_url):
Expand Down Expand Up @@ -216,53 +207,16 @@ def get_pypi_data_from_purl(purl):
@router.route("pkg:github/.*")
def get_github_data_from_purl(purl):
"""
Generate `Package` object from the `purl` string of github type
Yield `Package` object from the `purl` string of github type
"""
purl = PackageURL.from_string(purl)
name = purl.name
namespace = purl.namespace
base_path = "https://api.github.com/repos"
api_url = f"{base_path}/{namespace}/{name}"
response = get_response(api_url)
homepage_url = response.get("homepage")
vcs_url = response.get("git_url")
github_url = "https://github.com"
bug_tracking_url = f"{github_url}/{namespace}/{name}/issues"
code_view_url = f"{github_url}/{namespace}/{name}"
license_data = response.get("license") or {}
declared_license = license_data.get("spdx_id")
primary_language = response.get("language")
yield Package(
homepage_url=homepage_url,
vcs_url=vcs_url,
api_url=api_url,
bug_tracking_url=bug_tracking_url,
code_view_url=code_view_url,
declared_license=declared_license,
primary_language=primary_language,
**purl.to_dict(),
)
release_url = f"{api_url}/releases"
releases = get_response(release_url)
for release in releases:
version = release.get("name")
version_purl = PackageURL(
type=purl.type, namespace=namespace, name=name, version=version
)
download_url = release.get("tarball_url")
code_view_url = f"{github_url}/{namespace}/{name}/tree/{version}"
version_vcs_url = f"{vcs_url}@{version}"
yield Package(
homepage_url=homepage_url,
vcs_url=version_vcs_url,
api_url=api_url,
bug_tracking_url=bug_tracking_url,
code_view_url=code_view_url,
declared_license=declared_license,
primary_language=primary_language,
download_url=download_url,
**version_purl.to_dict(),
)

gh_package = f"{namespace}/{name}"
gh_source_class = GITHUB_SOURCE_BY_PACKAGE.get(gh_package, GitHubSource)

return gh_source_class.get_package_info(purl)


@router.route("pkg:bitbucket/.*")
Expand Down Expand Up @@ -399,11 +353,11 @@ def get_package_info(cls, package_url):

version = package_url.version
if version and version in IPKG_RELEASES:
archive = IPKG_RELEASES[version]
archives = IPKG_RELEASES[version]
yield Package(
homepage_url=cls.source_url,
download_url=archive["url"],
release_date=archive["date"],
download_url=archives["url"],
release_date=archives["date"],
**package_url.to_dict(),
)

Expand Down
214 changes: 214 additions & 0 deletions src/fetchcode/ipkg_release_info.py → src/fetchcode/package_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,220 @@

# Since there will be no new releases of ipkg, it's better to
# store them in a dictionary rather than fetching them every time.

import dataclasses
import re

import attr

from fetchcode import utils
from fetchcode.packagedcode_models import Package


def package_from_dict(package_data):
    """
    Build and return a Package from the `package_data` mapping.

    Only keys that name an attrs field of Package are used; every other key
    is dropped.
    """
    field_names = {field.name for field in attr.fields(Package)}
    known_data = {}
    for key, value in package_data.items():
        if key in field_names:
            known_data[key] = value
    return Package(**known_data)


@dataclasses.dataclass
class GitHubSource:
    """
    Default source of package data for a project hosted on GitHub.

    The two patterns below control how repository tags are turned into
    versions; both default to None.
    """

    # When set, a tag must match this pattern and the version is read from
    # its named group "version".
    version_regex: re.Pattern = dataclasses.field(
        default=None,
        metadata={
            "help_text": "Regular expression pattern to match and extract version from tag."
        },
    )
    # When set, tags matching this pattern are skipped.
    ignored_tag_regex: re.Pattern = dataclasses.field(
        default=None,
        metadata={"help_text": "Regex to ignore tag."},
    )

    @classmethod
    def get_default_package(cls, purl):
        """
        Return a Package holding the repository-level data for `purl`.

        The homepage, VCS URL, license and primary language are read from the
        JSON returned by the "https://api.github.com/repos" endpoint for the
        namespace and name of `purl`; the issues and code view URLs are built
        from the same namespace and name.
        """
        owner = purl.namespace
        repo = purl.name
        api_url = f"https://api.github.com/repos/{owner}/{repo}"
        repo_data = utils.get_response(api_url)

        code_view_url = f"https://github.com/{owner}/{repo}"
        # The "license" entry may be missing or null in the response.
        license_data = repo_data.get("license") or {}

        return Package(
            homepage_url=repo_data.get("homepage"),
            vcs_url=repo_data.get("git_url"),
            api_url=api_url,
            bug_tracking_url=f"{code_view_url}/issues",
            code_view_url=code_view_url,
            declared_license=license_data.get("spdx_id"),
            primary_language=repo_data.get("language"),
            **purl.to_dict(),
        )

    @classmethod
    def get_package_info(cls, package_url):
        """
        Yield Package objects for `package_url` using the tag patterns of
        this class.
        """
        version_regex = cls.version_regex
        ignored_tag_regex = cls.ignored_tag_regex
        default_package = cls.get_default_package(package_url)
        yield from get_github_packages(
            package_url, version_regex, ignored_tag_regex, default_package
        )


def get_github_packages(purl, version_regex, ignored_tag_regex, default_package):
    """
    Yield Package objects for the GitHub `purl`.

    Without a version in `purl`, yield every package produced for the
    repository tags. With a version, yield only the first package having
    that exact version and then stop.
    """
    packages = _get_github_packages(
        purl, version_regex, ignored_tag_regex, default_package
    )
    for package in packages:
        if not purl.version:
            # No specific version requested: report everything.
            yield package
        elif package.version == purl.version:
            # The requested version was found: no need to look further.
            yield package
            return


def _get_github_packages(purl, version_regex, ignored_tag_regex, default_package):
    """
    Yield a Package for each usable tag of the GitHub repository of `purl`.

    Tags are obtained from `utils.fetch_github_tags_gql(purl)` as
    (tag, date) pairs. A tag is skipped when:
    - it matches `ignored_tag_regex`, or
    - `version_regex` is set and does not match it, or
    - the resulting version is empty.

    With a `version_regex` the version is its named group "version";
    otherwise the tag itself is the version. A leading "v" or "V" prefix is
    removed from the version.

    Each yielded Package is built from the data of `default_package` with
    `download_url`, `release_date` and `version` set for the tag.
    """
    archive_download_url = (
        "https://github.com/{org}/{name}/archive/refs/tags/{tag_name}.tar.gz"
    )

    package_dict = default_package.to_dict()
    for tag, date in utils.fetch_github_tags_gql(purl):
        if ignored_tag_regex and ignored_tag_regex.match(tag):
            continue

        if version_regex:
            match = version_regex.match(tag)
            if not match:
                continue
            version = match.group("version")
        else:
            version = tag

        # Drop only the "v"/"V" prefix. str.strip("Vv") would also remove a
        # trailing "v", turning a tag such as "1.0-dev" into "1.0-de".
        version = version.lstrip("Vv").strip()
        if not version:
            continue

        # The archive URL uses the raw tag name, not the cleaned-up version.
        download_url = archive_download_url.format(
            org=purl.namespace, name=purl.name, tag_name=tag
        )
        package_dict.update(
            {
                "download_url": download_url,
                "release_date": date,
                "version": version,
            }
        )

        yield package_from_dict(package_dict)


class UBootGitHubSource(GitHubSource):
    """
    Tag patterns for "u-boot/u-boot".

    A version is a "v", four digits, a dot and two digits (such as
    "v2024.01") that is not followed by a word character, a dot or a hyphen.
    """

    ignored_tag_regex = None
    version_regex = re.compile(r"(?P<version>v\d{4}\.\d{2})(?![\w.-])")


class Genext2fsGitHubSource(GitHubSource):
    """
    Tag patterns for "bestouff/genext2fs".

    Tags are used as versions as-is, except for tags starting with
    "debian_version...upstream_version..." which are ignored.
    """

    ignored_tag_regex = re.compile(r"debian_version\S+upstream_version\S+")
    version_regex = None


class SquashfsToolsGitHubSource(GitHubSource):
    """
    Tag patterns for "plougher/squashfs-tools".

    A version is two or three dot-separated numbers, optionally prefixed
    with "v" or "V" (such as "4.6.1" or "v4.6").
    """

    ignored_tag_regex = None
    version_regex = re.compile(r"\b[vV]?(?P<version>(?:\d+(\.\d+){1,2}))\b")


class PupnpGitHubSource(GitHubSource):
    """
    Tag patterns for "pupnp/pupnp".

    A version is two or three dot-separated numbers following a "release"
    or "release-" prefix (such as "release-1.14.0").
    """

    ignored_tag_regex = None
    version_regex = re.compile(r"\brelease-?(?P<version>(?:\d+(\.\d+){1,2}))\b")


class BrotliGitHubSource(GitHubSource):
    """
    Tag patterns for "google/brotli".

    A version is two or three dot-separated numbers, optionally prefixed
    with "v" or "V" (such as "v1.1.0").
    """

    ignored_tag_regex = None
    version_regex = re.compile(r"\b[vV]?(?P<version>(?:\d+(\.\d+){1,2}))\b")


class BpftoolGitHubSource(GitHubSource):
    """
    Tag patterns for "libbpf/bpftool".

    A version is two or three dot-separated numbers, optionally prefixed
    with "v" or "V" (such as "v7.3.0").
    """

    ignored_tag_regex = None
    version_regex = re.compile(r"\b[vV]?(?P<version>(?:\d+(\.\d+){1,2}))\b")


class SqliteGitHubSource(GitHubSource):
    """
    Tag patterns for "sqlite/sqlite".

    A version is two or three dot-separated numbers following a "version"
    or "version-" prefix (such as "version-3.45.1").
    """

    ignored_tag_regex = None
    version_regex = re.compile(r"\bversion-?(?P<version>(?:\d+(\.\d+){1,2}))\b")


class LlvmGitHubSource(GitHubSource):
    """
    Tag patterns for "llvm/llvm-project".

    A version is everything that follows the "llvmorg-" prefix of a tag.
    """

    ignored_tag_regex = None
    version_regex = re.compile(r"llvmorg-(?P<version>.+)")


# Map a lowercase "namespace/name" GitHub package to the GitHubSource class
# used to collect its package data. Packages whose tags need no special
# handling map to the base GitHubSource.
GITHUB_SOURCE_BY_PACKAGE = {
    "u-boot/u-boot": UBootGitHubSource,
    "dosfstools/dosfstools": GitHubSource,
    "bestouff/genext2fs": Genext2fsGitHubSource,
    "plougher/squashfs-tools": SquashfsToolsGitHubSource,
    "avahi/avahi": GitHubSource,
    "inotify-tools/inotify-tools": GitHubSource,
    "hewlettpackard/wireless-tools": GitHubSource,
    "shadow-maint/shadow": GitHubSource,
    "pupnp/pupnp": PupnpGitHubSource,
    "google/brotli": BrotliGitHubSource,
    "libbpf/bpftool": BpftoolGitHubSource,
    "sqlite/sqlite": SqliteGitHubSource,
    "llvm/llvm-project": LlvmGitHubSource,
    "nixos/nix": GitHubSource,
}


class MiniupnpGitHubSource(GitHubSource):
    """
    GitHub source for tags of the form "<package_name>_<version>".

    The packages found are reported as "generic" packages named after the
    requested `package_name`, without a namespace.
    """

    version_regex_template = r"{}_(?P<version>.+)"
    ignored_tag_regex = None
    version_regex = None

    @classmethod
    def get_package_info(cls, gh_purl, package_name):
        """
        Yield Package objects for `package_name` from the tags of the GitHub
        repository `gh_purl`.

        Only tags starting with "<package_name>_" are considered; underscores
        in the extracted version are replaced with dots.
        """
        # The compiled pattern is also stored on the class as `version_regex`.
        tag_pattern = cls.version_regex_template.format(re.escape(package_name))
        cls.version_regex = re.compile(tag_pattern)

        github_packages = get_github_packages(
            gh_purl,
            cls.version_regex,
            cls.ignored_tag_regex,
            cls.get_default_package(gh_purl),
        )

        for github_package in github_packages:
            data = github_package.to_dict()
            data.update(
                namespace=None,
                name=package_name,
                type="generic",
                version=data["version"].replace("_", "."),
            )
            yield package_from_dict(data)


IPKG_RELEASES = {
"0.99.88": {
"url": "https://web.archive.org/web/20090326020239/http:/handhelds.org/download/packages/ipkg/ipkg-0.99.88.tar.gz",
Expand Down
16 changes: 13 additions & 3 deletions src/fetchcode/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,12 @@
# http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.


import os
from dateutil import parser as dateparser
import requests
from dateutil import parser as dateparser


def fetch_github_tags_gql(purl):
Expand Down Expand Up @@ -144,3 +143,14 @@ def github_response(graphql_query):
raise GraphQLError(errors)

return response


def get_response(url):
    """
    Return the decoded JSON body of a GET request to the `url` string.

    Raise an Exception when the response status code is not 200.
    """
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch: {url}")

    return response.json()

0 comments on commit 76ba556

Please sign in to comment.