diff --git a/requirements.txt b/requirements.txt index 78a6b533..c4e0b690 100644 --- a/requirements.txt +++ b/requirements.txt @@ -73,6 +73,7 @@ text-unidecode==1.3 toml==0.10.2 typecode==30.0.0 typecode-libmagic==5.39.210531 +univers==30.11.0 urllib3==1.26.9 urlpy==0.5 wcwidth==0.2.5 diff --git a/setup.cfg b/setup.cfg index 200c39cb..1a195891 100644 --- a/setup.cfg +++ b/setup.cfg @@ -59,6 +59,7 @@ install_requires = requests python-dateutil python-dotenv + univers == 30.11.0 [options.packages.find] @@ -80,4 +81,3 @@ docs = sphinx-rtd-theme>=1.0.0 sphinx-reredirects >= 0.1.2 doc8>=0.11.2 - diff --git a/src/fetchcode/package.py b/src/fetchcode/package.py index 22d50b9c..2d61aeac 100644 --- a/src/fetchcode/package.py +++ b/src/fetchcode/package.py @@ -15,15 +15,20 @@ # specific language governing permissions and limitations under the License. import dataclasses +import json +import logging +import os import re import time from typing import List from urllib.parse import urljoin import htmllistparse +from bs4 import BeautifulSoup from packageurl import PackageURL from packageurl.contrib.route import NoRouteAvailable from packageurl.contrib.route import Router +from univers import versions from fetchcode.package_util import GITHUB_SOURCE_BY_PACKAGE from fetchcode.package_util import IPKG_RELEASES @@ -33,10 +38,19 @@ from fetchcode.package_util import MiniupnpPackagesGitHubSource from fetchcode.package_util import OpenSSLGitHubSource from fetchcode.packagedcode_models import Package +from fetchcode.utils import get_complete_response +from fetchcode.utils import get_github_rest +from fetchcode.utils import get_github_rest_no_exception +from fetchcode.utils import get_hashed_path +from fetchcode.utils import get_json_response from fetchcode.utils import get_response +from fetchcode.utils import make_head_request router = Router() +LOG_FILE_LOCATION = os.path.join(os.path.expanduser("~"), "purlcli.log") +logger = logging.getLogger(__name__) + def info(url): """ @@ -362,6 
def _report_cocoapods_issue(message, level=logging.ERROR):
    """
    Log `message` with the module logger at `level` and echo it on stdout.
    """
    logger.log(level, message)
    print(message)


def _get_cocoapods_org_links(soup):
    """
    Return a (homepage_url, gh_repo_owner, gh_repo_name, podspec_url) tuple
    collected from the `<ul class="links">` lists of a cocoapods.org pod page
    parsed as `soup`. Any item that is not found is None.
    """
    homepage_url = gh_repo_owner = gh_repo_name = podspec_url = None
    for links_list in soup.find_all("ul", class_="links"):
        for link in links_list.find_all("a"):
            link_url = link.get("href")
            if not link_url:
                # An anchor without an href carries no usable URL.
                continue
            link_text = link.text
            if link_text == "Homepage":
                homepage_url = link_url
            elif link_text == "GitHub Repo":
                # Tolerate a trailing slash so the repo name is never empty.
                segments = link_url.rstrip("/").split("/")
                if len(segments) >= 2:
                    gh_repo_owner, gh_repo_name = segments[-2], segments[-1]
            elif link_text == "See Podspec":
                podspec_url = link_url
    return homepage_url, gh_repo_owner, gh_repo_name, podspec_url


def _get_cocoapods_org_pod_name(soup, purl):
    """
    Return the content of the 'og:title' meta tag of the cocoapods.org page
    parsed as `soup`, or None (after reporting the problem) when the page has
    no <head> or no such meta tag. `purl` is only used in messages.
    """
    head = soup.find("head")
    if head is None:
        _report_cocoapods_issue(
            f"<head> section not found in cocoapods.org page for {purl}"
        )
        return None
    og_title_tag = head.find("meta", property="og:title")
    if og_title_tag is None:
        _report_cocoapods_issue(
            f"'og:title' meta tag not found in cocoapods.org page for {purl}"
        )
        return None
    return og_title_tag.get("content")


@router.route("pkg:cocoapods/.*")
def get_cocoapods_data_from_purl(purl):
    """
    Yield package data for the CocoaPods `purl` string, one item per pod
    version listed on the CocoaPods CDN, or only the item matching the purl
    version when the purl has one.

    Data is collected from the cocoapods.org pod page, from the GitHub REST
    API when the pod page links to a GitHub repo, and from the podspec JSON in
    the CocoaPods/Specs repository. Problems are logged (and most are printed)
    and the generator then stops without raising. An item may be None when the
    podspec for a version cannot be fetched.
    """
    # NOTE(review): this configures the root logger from library code; kept
    # as-is for backward compatibility with existing callers.
    logging.basicConfig(
        filename=LOG_FILE_LOCATION,
        level=logging.WARN,
        format="%(levelname)s - %(message)s",
        filemode="w",
    )
    input_purl = purl
    purl = PackageURL.from_string(purl)
    name = purl.name
    cocoapods_org_url = f"https://cocoapods.org/pods/{name}"
    repository_homepage_url = cocoapods_org_url

    # Diagnostic summary of everything learned about the pod. It is printed
    # below and passed on to `construct_cocoapods_package`, which reads
    # `cocoapods_org_pod_name` and records `http_url`.
    pod_summary = {
        "input_purl": input_purl,
        "input_name": name,
        "cocoapods_org_url": cocoapods_org_url,
        "repository_homepage_url": repository_homepage_url,
        "no_github_repo": None,
        "gh_repo_four_o_four": None,
        "http_url": None,
    }

    head_response = make_head_request(cocoapods_org_url)
    # `make_head_request` returns a plain string on network errors, so the
    # status code may be unavailable; the GET below then decides.
    status_code = getattr(head_response, "status_code", None)
    pod_summary["cocoapods_org_url_status_code"] = status_code
    if status_code == 404:
        logger.error(f"cocoapods_org_url not found for {name}")
        return
    if status_code == 302:
        redirect_url = head_response.headers.get("Location")
        _report_cocoapods_issue(
            f"The cocoapods.org URL {cocoapods_org_url} redirects to {redirect_url}"
        )
        return

    cocoapods_org_response = get_complete_response(cocoapods_org_url)
    if isinstance(cocoapods_org_response, str):
        # A string is a failure marker: "not_found" or "Failed to fetch: ...".
        if cocoapods_org_response == "not_found":
            logger.error(f"cocoapods_org_url not found for {name}")
        else:
            _report_cocoapods_issue(cocoapods_org_response)
        return

    soup = BeautifulSoup(cocoapods_org_response.text, "html.parser")
    (
        cocoapods_org_pkg_home_url,
        cocoapods_org_gh_repo_owner,
        cocoapods_org_gh_repo_name,
        cocoapods_org_podspec_url,
    ) = _get_cocoapods_org_links(soup)

    cocoapods_org_gh_repo_url = None
    if cocoapods_org_gh_repo_owner and cocoapods_org_gh_repo_name:
        cocoapods_org_gh_repo_url = (
            f"https://github.com/{cocoapods_org_gh_repo_owner}/{cocoapods_org_gh_repo_name}"
        )
        gh_head_response = make_head_request(cocoapods_org_gh_repo_url)
        gh_status_code = getattr(gh_head_response, "status_code", None)
        pod_summary["cocoapods_org_gh_repo_url_status_code"] = gh_status_code
        if gh_status_code == 404:
            gh_repo_four_o_four = (
                f"The cocoapods.org GitHub repo url for {name} returns 404"
            )
            _report_cocoapods_issue(gh_repo_four_o_four)
            pod_summary["gh_repo_four_o_four"] = gh_repo_four_o_four

        name = cocoapods_org_gh_repo_name
        api_url = (
            "https://api.github.com/repos"
            f"/{cocoapods_org_gh_repo_owner}/{cocoapods_org_gh_repo_name}"
        )
        response = get_github_rest_no_exception(api_url)
        if isinstance(response, str):
            # Only reported here; the per-version package construction queries
            # the GitHub API again and copes with the failure itself.
            _report_cocoapods_issue(response)

    pod_summary["cocoapods_org_gh_repo_owner"] = cocoapods_org_gh_repo_owner
    pod_summary["cocoapods_org_gh_repo_name"] = cocoapods_org_gh_repo_name
    pod_summary["cocoapods_org_gh_repo_url"] = cocoapods_org_gh_repo_url
    pod_summary["cocoapods_org_podspec_url"] = cocoapods_org_podspec_url
    pod_summary["cocoapods_org_pkg_home_url"] = cocoapods_org_pkg_home_url

    if cocoapods_org_gh_repo_owner is None or cocoapods_org_gh_repo_name is None:
        no_github_repo = f"No GitHub repo found on cocoapods.org for {name}"
        _report_cocoapods_issue(no_github_repo, level=logging.WARNING)
        pod_summary["no_github_repo"] = no_github_repo

    if cocoapods_org_podspec_url is None:
        no_podspec = f"No podspec found on cocoapods.org for {name}"
        _report_cocoapods_issue(no_podspec, level=logging.WARNING)
        pod_summary["no_podspec"] = no_podspec

    cocoapods_org_pod_name = _get_cocoapods_org_pod_name(soup, purl)
    pod_summary["cocoapods_org_pod_name"] = cocoapods_org_pod_name
    if not cocoapods_org_pod_name:
        # Without the pod name neither the CDN versions file nor the podspec
        # path can be computed; the problem was already reported above.
        return

    input_name = pod_summary["input_name"]
    if input_name != cocoapods_org_pod_name:
        _report_cocoapods_issue(
            f"Input PURL name '{input_name}' analyzed as "
            f"'{cocoapods_org_pod_name}' per {cocoapods_org_url}",
            level=logging.WARNING,
        )

    hashed_path = get_hashed_path(cocoapods_org_pod_name)
    hashed_path_underscore = hashed_path.replace("/", "_")
    spec = f"https://cdn.cocoapods.org/all_pods_versions_{hashed_path_underscore}.txt"
    # `get_cocoapod_tags` returns None when nothing could be retrieved.
    data_list = get_cocoapod_tags(spec, cocoapods_org_pod_name) or []

    print(f"\npod_summary = {json.dumps(pod_summary, indent=4, sort_keys=False)}\n")

    for tag in data_list:
        if purl.version and tag != purl.version:
            continue

        yield construct_cocoapods_package(
            purl,
            name,
            hashed_path,
            repository_homepage_url,
            cocoapods_org_gh_repo_owner,
            cocoapods_org_gh_repo_name,
            tag,
            pod_summary,
        )

        if purl.version:
            # Only the requested version was wanted.
            break


def get_cocoapod_tags(spec, cocoapods_org_pod_name):
    """
    Return the list of version strings of the pod named
    `cocoapods_org_pod_name` found in the versions file at the `spec` URL,
    sorted from newest to oldest, or None when the file cannot be fetched, the
    pod has no line in it, or its versions cannot be sorted.

    Each line of the file is expected to look like `<pod name>/<v1>/<v2>/...`.
    """
    try:
        response = get_complete_response(spec)
        if isinstance(response, str):
            # Failure marker ("not_found" or "Failed to fetch: ...").
            logger.error(f"Error retrieving data from API: {response}")
            print("Error retrieving data from API")
            return None

        for line in response.text.splitlines():
            fields = line.strip().split("/")
            # Match the whole pod name, not a prefix: pod "AF" must not pick
            # up the line of "AFNetworking".
            if fields[0] != cocoapods_org_pod_name:
                continue
            return sorted(fields[1:], key=versions.SemverVersion, reverse=True)
        return None
    except Exception:
        # Best effort by design: report and let the caller carry on without
        # versions rather than crash.
        logger.exception("Error retrieving data from API")
        print("Error retrieving data from API")
        return None


def construct_cocoapods_package(
    purl,
    name,
    hashed_path,
    repository_homepage_url,
    cocoapods_org_gh_repo_owner,
    cocoapods_org_gh_repo_name,
    tag,
    pod_summary,
):
    """
    Return a `Package` for the version `tag` of the pod described by `purl`,
    or None when the podspec JSON for that version cannot be fetched.

    License and primary language come from the GitHub REST API when both
    `cocoapods_org_gh_repo_owner` and `cocoapods_org_gh_repo_name` are set.
    Homepage, VCS and download URLs come from the podspec located with
    `hashed_path` and `pod_summary["cocoapods_org_pod_name"]`.
    `pod_summary["http_url"]` is updated when the podspec has an HTTP source.
    """
    bug_tracking_url = None
    code_view_url = None
    declared_license = None
    primary_language = None

    if cocoapods_org_gh_repo_owner and cocoapods_org_gh_repo_name:
        name = cocoapods_org_gh_repo_name
        namespace = cocoapods_org_gh_repo_owner
        github_api_url = f"https://api.github.com/repos/{namespace}/{name}"

        github_data = get_github_rest_no_exception(github_api_url)
        if isinstance(github_data, dict):
            license_data = github_data.get("license") or {}
            declared_license = license_data.get("spdx_id")
            primary_language = github_data.get("language")

        github_url = "https://github.com"
        bug_tracking_url = f"{github_url}/{namespace}/{name}/issues"
        code_view_url = f"{github_url}/{namespace}/{name}"

    corrected_name = pod_summary["cocoapods_org_pod_name"]
    api_url = f"https://raw.githubusercontent.com/CocoaPods/Specs/master/Specs/{hashed_path}/{corrected_name}/{tag}/{corrected_name}.podspec.json"

    podspec = get_json_response(api_url)
    if not isinstance(podspec, dict):
        # A string here is the "Failed to fetch: ..." marker.
        _report_cocoapods_issue(f"{podspec}")
        return

    homepage_url = podspec.get("homepage")

    # NOTE(review): a podspec "license" may be a mapping rather than a string;
    # it is passed through unchanged, as before.
    if not declared_license:
        declared_license = podspec.get("license")

    source = podspec.get("source")
    vcs_url = None
    download_url = None
    if isinstance(source, dict):
        git_url = source.get("git", "")
        http_url = source.get("http", "")
        if http_url:
            download_url = http_url
            pod_summary["http_url"] = http_url
        if git_url:
            vcs_url = git_url
        if git_url and not http_url and git_url.endswith(".git"):
            gh_path = git_url[:-4]
            # Prefer the podspec tag when it carries a "v" prefix; a source
            # pinned by commit or branch has no tag at all.
            source_tag = source.get("tag")
            corrected_tag = tag
            if isinstance(source_tag, str) and source_tag.startswith("v"):
                corrected_tag = source_tag
            download_url = f"{gh_path}/archive/refs/tags/{corrected_tag}.tar.gz"
    elif isinstance(source, str):
        vcs_url = source

    purl_pkg = Package(
        homepage_url=homepage_url,
        api_url=api_url,
        bug_tracking_url=bug_tracking_url,
        code_view_url=code_view_url,
        download_url=download_url,
        declared_license=declared_license,
        primary_language=primary_language,
        repository_homepage_url=repository_homepage_url,
        vcs_url=vcs_url,
        **purl.to_dict(),
    )
    purl_pkg.version = tag

    return purl_pkg
import unicode_literals -from collections import OrderedDict import logging import sys +from collections import OrderedDict import attr -from packageurl import normalize_qualifiers -from packageurl import PackageURL - -from commoncode.datautils import choices from commoncode.datautils import Boolean from commoncode.datautils import Date from commoncode.datautils import Integer @@ -43,7 +39,9 @@ from commoncode.datautils import Mapping from commoncode.datautils import String from commoncode.datautils import TriBoolean - +from commoncode.datautils import choices +from packageurl import PackageURL +from packageurl import normalize_qualifiers """ Data models for package information and dependencies, abstracting the @@ -247,31 +245,24 @@ def purl(self): self.type, self.namespace, self.name, self.version, self.qualifiers, self.subpath).to_string() - def repository_homepage_url(self, baseurl=default_web_baseurl): - """ - Return the package repository homepage URL for this package, e.g. the - URL to the page for this package in its package repository. This is - typically different from the package homepage URL proper. - Subclasses should override to provide a proper value. - """ - return - - def repository_download_url(self, baseurl=default_download_baseurl): - """ - Return the package repository download URL to download the actual - archive of code of this package. This may be different than the actual - download URL and is computed from the default public respoitory baseurl. - Subclasses should override to provide a proper value. - """ - return - - def api_data_url(self, baseurl=default_api_baseurl): - """ - Return the package repository API URL to obtain structured data for this - package such as the URL to a JSON or XML api. - Subclasses should override to provide a proper value. - """ - return + repository_homepage_url = String( + label='package repository homepage URL.', + help='URL to the page for this package in its package repository. 
' + 'This is typically different from the package homepage URL proper.' + ) + + repository_download_url = String( + label='package repository download URL.', + help='download URL to download the actual archive of code of this ' + 'package in its package repository. ' + 'This may be different from the actual download URL.' + ) + + api_data_url = String( + label='package repository API URL.', + help='API URL to obtain structured data for this package such as the ' + 'URL to a JSON or XML api its package repository.' + ) def set_purl(self, package_url): """ @@ -298,9 +289,9 @@ def to_dict(self, **kwargs): mapping = attr.asdict(self, dict_factory=OrderedDict) if not kwargs.get('exclude_properties'): mapping['purl'] = self.purl - mapping['repository_homepage_url'] = self.repository_homepage_url() - mapping['repository_download_url'] = self.repository_download_url() - mapping['api_data_url'] = self.api_data_url() + mapping['repository_homepage_url'] = self.repository_homepage_url + mapping['repository_download_url'] = self.repository_download_url + mapping['api_data_url'] = self.api_data_url if self.qualifiers: mapping['qualifiers'] = normalize_qualifiers(self.qualifiers, encode=False) return mapping @@ -399,7 +390,7 @@ class Package(BasePackage): api_url = String( label='API URL', - help='URL of API for this package.') + help='URL of API for this package.') size = Integer( default=None, @@ -842,4 +833,4 @@ class SquashfsPackage(Package): default_type = 'squashfs' -# TODO: Add VM images formats(VMDK, OVA, OVF, VDI, etc) and Docker/other containers \ No newline at end of file +# TODO: Add VM images formats(VMDK, OVA, OVF, VDI, etc) and Docker/other containers diff --git a/src/fetchcode/utils.py b/src/fetchcode/utils.py index 1e5ab842..46a626bb 100644 --- a/src/fetchcode/utils.py +++ b/src/fetchcode/utils.py @@ -14,7 +14,11 @@ # CONDITIONS OF ANY KIND, either express or implied. 
def get_github_rest_no_exception(url):
    """
    Return the decoded JSON data of the GitHub REST API `url`, or a
    "Failed to fetch: <url>" string on failure.

    The request is authenticated with a bearer token when
    `get_github_token()` returns one.
    """
    headers = None
    gh_token = get_github_token()
    if gh_token:
        headers = {
            "Authorization": f"Bearer {gh_token}",
        }

    return get_json_response(url, headers)


def get_json_response(url, headers=None):
    """
    Return the decoded JSON body of a GET request to `url` sent with optional
    `headers`, or the string "Failed to fetch: <url>" when the request fails,
    the status code is not 200 or the body is not valid JSON.
    """
    failure = f"Failed to fetch: {url}"
    try:
        resp = requests.get(url, headers=headers)
    except requests.exceptions.RequestException:
        # Callers test for the failure string instead of catching exceptions.
        return failure

    if resp.status_code != 200:
        return failure

    try:
        return resp.json()
    except ValueError:
        # A 200 response whose body is not JSON is still a failed fetch.
        return failure


def get_complete_response(url, headers=None, params=None):
    """
    Return the `requests` response object of a GET request to `url` sent with
    optional `headers` and query `params` when the status code is 200.

    Otherwise return a string: "not_found" for a 404 and
    "Failed to fetch: <url>" for any other status code or a network error.
    """
    try:
        resp = requests.get(url, headers=headers, params=params)
    except requests.exceptions.RequestException:
        return f"Failed to fetch: {url}"

    if resp.status_code == 200:
        return resp
    if resp.status_code == 404:
        return "not_found"

    return f"Failed to fetch: {url}"


def make_head_request(url, headers=None):
    """
    Return the `requests` response object of a HEAD request to `url` sent with
    optional `headers`, whatever its status code.

    Return the string "cannot_confirm" (after printing the error) when the
    request itself fails, so callers must check the type before reading
    response attributes.
    """
    try:
        return requests.head(url, headers=headers)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return "cannot_confirm"


def get_hashed_path(name):
    """
    Return the part of a podspec file path derived from the md5 hash of the
    pod `name`: its first three hex characters joined with "/", such as
    "9/0/0". Return None for an empty `name`.

    Only the part of `name` before the first "/" is hashed.

    From https://github.com/CocoaPods/cdn.cocoapods.org:
    "There are a set of known prefixes for all Podspec paths, you take the
    name of the pod, create a hash (using md5) of it and take the first
    three characters."
    """
    if not name:
        return

    hash_init = get_first_three_md5_hash_characters(get_podname_proper(name))
    return "/".join(hash_init)


# for FIPS support: md5 is used here for path sharding only, not for security,
# and `usedforsecurity` is available from Python 3.9 on.
sys_v0 = sys.version_info[0]
sys_v1 = sys.version_info[1]
if (sys_v0, sys_v1) >= (3, 9):
    md5_hasher = partial(hashlib.md5, usedforsecurity=False)
else:
    md5_hasher = hashlib.md5


def get_podname_proper(podname):
    """
    Return the part of `podname` before its first "/", or `podname` unchanged
    when it has none.

    Podnames in cocoapods sometimes are files inside a pods package (like
    'OHHTTPStubs/Default'); this returns the proper podname in those cases.
    """
    return podname.partition("/")[0]


def get_first_three_md5_hash_characters(podname):
    """
    Return the first three characters of the hex md5 digest of the UTF-8
    encoded `podname`.

    From https://github.com/CocoaPods/cdn.cocoapods.org:
    "There are a set of known prefixes for all Podspec paths, you take the name of the pod,
    create a hash (using md5) of it and take the first three characters."
    """
    return md5_hasher(podname.encode("utf-8")).hexdigest()[0:3]