From 11fd4c5b4ab4c7dac606d51e3116e9073ebda8c7 Mon Sep 17 00:00:00 2001 From: salman2013 Date: Wed, 17 Apr 2024 17:46:03 +0500 Subject: [PATCH] chore: testing workflow --- scripts/find_dependencies.py | 372 +++++++++++++++++++---------------- 1 file changed, 199 insertions(+), 173 deletions(-) diff --git a/scripts/find_dependencies.py b/scripts/find_dependencies.py index 89f0ef757231..a70fdb8585ad 100644 --- a/scripts/find_dependencies.py +++ b/scripts/find_dependencies.py @@ -70,26 +70,26 @@ def run_command(cmd: str, outfile=None) -> Tuple[bool, str]: return proc.returncode == 0, output.strip() -@cached -def npm_repo_url(npm_spec: str) -> Optional[str]: - """Given 'jspkg@0.1.0', return a repo url.""" - pkg, _, ver = npm_spec.rpartition("@") - url = f"https://registry.npmjs.org/{pkg}/{ver}" - try: - resp = requests.get(url, timeout=60) - if resp.status_code != 200: - print(f"{npm_spec}: {url} -> {resp.status_code}") - return None - jdata = resp.json() - except requests.RequestException as exc: - print(f"Couldn't fetch npm data for {npm_spec}: {exc}") - return None - repo = jdata.get("repository") - if repo is None: - return None - if isinstance(repo, dict): - repo = repo["url"] - return repo +# @cached +# def npm_repo_url(npm_spec: str) -> Optional[str]: +# """Given 'jspkg@0.1.0', return a repo url.""" +# pkg, _, ver = npm_spec.rpartition("@") +# url = f"https://registry.npmjs.org/{pkg}/{ver}" +# try: +# resp = requests.get(url, timeout=60) +# if resp.status_code != 200: +# print(f"{npm_spec}: {url} -> {resp.status_code}") +# return None +# jdata = resp.json() +# except requests.RequestException as exc: +# print(f"Couldn't fetch npm data for {npm_spec}: {exc}") +# return None +# repo = jdata.get("repository") +# if repo is None: +# return None +# if isinstance(repo, dict): +# repo = repo["url"] +# return repo def canonical_url(url: str) -> str: """Canonicalize a repo URL, probably on GitHub.""" @@ -106,25 +106,25 @@ def canonical_url(url: str) -> str: url = f"https://github.com/{url}" return url -@cached -def find_real_url(url: str) -> Optional[str]: - """Find the eventual real url for a redirected url.""" - while True: - try: - resp = requests.head(url, timeout=60, allow_redirects=True) - except requests.RequestException as exc: - print(f"Couldn't fetch {url}: {exc}") - return None - if resp.status_code == 429: - # I didn't know you could get 429 from https://github.com, but you can... - wait = int(resp.headers.get("Retry-After", 10)) - time.sleep(wait + 1) - else: - break - - if resp.status_code == 200: - return resp.url - return None +# @cached +# def find_real_url(url: str) -> Optional[str]: +# """Find the eventual real url for a redirected url.""" +# while True: +# try: +# resp = requests.head(url, timeout=60, allow_redirects=True) +# except requests.RequestException as exc: +# print(f"Couldn't fetch {url}: {exc}") +# return None +# if resp.status_code == 429: +# # I didn't know you could get 429 from https://github.com, but you can... 
+# wait = int(resp.headers.get("Retry-After", 10)) +# time.sleep(wait + 1) +# else: +# break + +# if resp.status_code == 200: +# return resp.url +# return None WORK_DIR = Path("/tmp/unpack_reqs") @@ -140,104 +140,104 @@ def parallel_map(func, data, description): progress.update(pbar, advance=1) yield result -def write_list(path: str, lines: Iterable[str]): - """Write a list of strings to a file.""" - with Path(path).open("w") as flist: - for line in lines: - print(line, file=flist) - -def check_js_dependencies() -> Iterable[str]: - """Check the JS dependencies in package-lock.json, returning a set of repo URLs.""" - print("Checking JavaScript dependencies") - with Path("package-lock.json").open() as lockf: - lock_data = json.load(lockf) - - deps = set() - for name, pkg in lock_data["packages"].items(): - name = pkg.get("name") or name - name = name.rpartition("node_modules/")[-1] - version = pkg.get("version") - if version is None: - continue - deps.add(f"{name}@{version}") - write_list("deps.txt", sorted(deps)) - - urls = set() - for url in parallel_map(npm_repo_url, deps, "Getting npm URLs"): - if url: - urls.add(canonical_url(url)) - - real_urls = set() - for url in parallel_map(find_real_url, urls, "Getting real URLs"): - if url: - real_urls.add(url) - - print(f"{len(deps)} deps, {len(urls)} urls, {len(real_urls)} real urls") - write_list("repo_urls.txt", sorted(real_urls)) - return real_urls - -def check_py_dependencies() -> Iterable[str]: - """Check the Python dependencies in base.txt, returning a set of repo URLs.""" - print("Checking Python dependencies") - - print("Creating venv") - run_command("python3 -m venv .venv", "make_venv.log") - run_command(".venv/bin/python3 -m pip install -U pip", "pip_upgrade.log") - print("Downloading packages") - run_command(".venv/bin/python3 -m pip download --dest files -r base.txt", "pip_download.log") - - urls = set() - for url in parallel_map(repo_url_from_wheel, Path("files").glob("*.whl"), "Examining wheels"): - if url: - urls.add(canonical_url(url)) - - for url in parallel_map(repo_url_from_tgz, Path("files").glob("*.tar.gz"), "Examining tar.gz"): - if url: - urls.add(canonical_url(url)) - - with open("base.txt") as fbase: - for line in fbase: - if match := re.search(r"https://github.com[^@ #]*(\.git)?", line): - urls.add(canonical_url(match[0])) - - real_urls = set() - for url in parallel_map(find_real_url, urls, "Getting real URLs"): - if url: - real_urls.add(url) - - write_list("repo_urls.txt", sorted(real_urls)) - return real_urls - -def matching_text(text, regexes): - """Find a line in text matching a regex, and return the first regex group.""" - for regex in regexes: - for line in text.splitlines(): - if match := re.search(regex, line): - return match[1] - return None - -@cached -def repo_url_from_wheel(wheel_path: str) -> Optional[str]: - """Read metadata from a .whl file, returning the repo URL.""" - with zipfile.ZipFile(wheel_path) as whl_file: - fmetadata = next((f for f in whl_file.namelist() if f.endswith("/METADATA")), None) - if fmetadata is None: - print(f"No metadata in {wheel_path}") - return None - with whl_file.open(fmetadata) as inner_file: - metadata = inner_file.read().decode("utf-8") - return repo_url_from_metadata(wheel_path, metadata) - -@cached -def repo_url_from_tgz(tgz_path: str) -> Optional[str]: - """Read metadata from a .tar.gz file, returning the repo URL.""" - with tarfile.open(tgz_path) as tgz_file: - fmetadata = next((f for f in tgz_file.getnames() if f.endswith("/PKG-INFO")), None) - if fmetadata is None: - 
print(f"No metadata in {tgz_path}") - return None - metadata = tgz_file.extractfile(fmetadata).read().decode("utf-8") - return repo_url_from_metadata(tgz_path, metadata) +# def write_list(path: str, lines: Iterable[str]): +# """Write a list of strings to a file.""" +# with Path(path).open("w") as flist: +# for line in lines: +# print(line, file=flist) + +# def check_js_dependencies() -> Iterable[str]: +# """Check the JS dependencies in package-lock.json, returning a set of repo URLs.""" +# print("Checking JavaScript dependencies") +# with Path("package-lock.json").open() as lockf: +# lock_data = json.load(lockf) + +# deps = set() +# for name, pkg in lock_data["packages"].items(): +# name = pkg.get("name") or name +# name = name.rpartition("node_modules/")[-1] +# version = pkg.get("version") +# if version is None: +# continue +# deps.add(f"{name}@{version}") +# write_list("deps.txt", sorted(deps)) + +# urls = set() +# for url in parallel_map(npm_repo_url, deps, "Getting npm URLs"): +# if url: +# urls.add(canonical_url(url)) + +# real_urls = set() +# for url in parallel_map(find_real_url, urls, "Getting real URLs"): +# if url: +# real_urls.add(url) + +# print(f"{len(deps)} deps, {len(urls)} urls, {len(real_urls)} real urls") +# write_list("repo_urls.txt", sorted(real_urls)) +# return real_urls + +# def check_py_dependencies() -> Iterable[str]: +# """Check the Python dependencies in base.txt, returning a set of repo URLs.""" +# print("Checking Python dependencies") + +# print("Creating venv") +# run_command("python3 -m venv .venv", "make_venv.log") +# run_command(".venv/bin/python3 -m pip install -U pip", "pip_upgrade.log") +# print("Downloading packages") +# run_command(".venv/bin/python3 -m pip download --dest files -r base.txt", "pip_download.log") + +# urls = set() +# for url in parallel_map(repo_url_from_wheel, Path("files").glob("*.whl"), "Examining wheels"): +# if url: +# urls.add(canonical_url(url)) + +# for url in parallel_map(repo_url_from_tgz, Path("files").glob("*.tar.gz"), "Examining tar.gz"): +# if url: +# urls.add(canonical_url(url)) + +# with open("base.txt") as fbase: +# for line in fbase: +# if match := re.search(r"https://github.com[^@ #]*(\.git)?", line): +# urls.add(canonical_url(match[0])) + +# real_urls = set() +# for url in parallel_map(find_real_url, urls, "Getting real URLs"): +# if url: +# real_urls.add(url) + +# write_list("repo_urls.txt", sorted(real_urls)) +# return real_urls + +# def matching_text(text, regexes): +# """Find a line in text matching a regex, and return the first regex group.""" +# for regex in regexes: +# for line in text.splitlines(): +# if match := re.search(regex, line): +# return match[1] +# return None + +# @cached +# def repo_url_from_wheel(wheel_path: str) -> Optional[str]: +# """Read metadata from a .whl file, returning the repo URL.""" +# with zipfile.ZipFile(wheel_path) as whl_file: +# fmetadata = next((f for f in whl_file.namelist() if f.endswith("/METADATA")), None) +# if fmetadata is None: +# print(f"No metadata in {wheel_path}") +# return None +# with whl_file.open(fmetadata) as inner_file: +# metadata = inner_file.read().decode("utf-8") +# return repo_url_from_metadata(wheel_path, metadata) + +# @cached +# def repo_url_from_tgz(tgz_path: str) -> Optional[str]: +# """Read metadata from a .tar.gz file, returning the repo URL.""" +# with tarfile.open(tgz_path) as tgz_file: +# fmetadata = next((f for f in tgz_file.getnames() if f.endswith("/PKG-INFO")), None) +# if fmetadata is None: +# print(f"No metadata in {tgz_path}") +# return None 
+# metadata = tgz_file.extractfile(fmetadata).read().decode("utf-8") +# return repo_url_from_metadata(tgz_path, metadata) SOURCE_URL_REGEXES = [ @@ -264,9 +264,15 @@ def repo_url_from_metadata(filename, metadata): # The first of these we find is the requirements file we'll examine: PY_REQS = [ - "requirements/edx/base.txt", - "requirements/base.txt", - "requirements.txt", + # "requirements/edx/base.txt", + #"requirements/edx/base.in", + "requirements/edx/kernel.in", + #"requirements/edx/bundled.in", + #"requirements/edx/testing.in", + # "requirements/base.txt", + #"requirements/base.in", + "requirements/kernel.in", + # "requirements.txt", ] # Files that indicate a repo is a Python project: @@ -286,6 +292,19 @@ def find_py_reqs(): print(f"WARNING: {os.getcwd()} is likely a Python package, but we can't find its dependencies.") return None +def request_package_info_url(package): + base_url = "https://pypi.org/pypi/" + url = f"{base_url}{package}/json" + response = requests.get(url) + if response.status_code == 200: + #print(f"Package: {package}") + data_dict = response.json() + info = data_dict["info"] + home_page = info["home_page"] + return home_page + else: + print(f"Failed to retrieve data for package {package}. Status code:", response.status_code) + def process_directory(): """ @@ -293,20 +312,40 @@ def process_directory(): Also copies the considered dependencies file into the temp work directory, for later analysis. - """ + """ repo_name = Path.cwd().name repo_work = WORK_DIR / repo_name repo_work.mkdir(parents=True, exist_ok=True) repo_urls = set() + package_names = [] + openedx_packages = [] if (js_reqs := Path("package-lock.json")).exists(): shutil.copyfile(js_reqs, repo_work / "package-lock.json") - with change_dir(repo_work): - repo_urls.update(check_js_dependencies()) + # with change_dir(repo_work): + # repo_urls.update(check_js_dependencies()) if (py_reqs := find_py_reqs()): shutil.copyfile(py_reqs, repo_work / "base.txt") - with change_dir(repo_work): - repo_urls.update(check_py_dependencies()) - return repo_urls + + with open(repo_work / "base.txt") as fbase: + # Read each line (package name) in the file + file_data = fbase.read() + + # Splitting the data by lines + lines = file_data.strip().split('\n') + for line in lines: + # Print the package name + parts = line.split('#', 1) + package_name = parts[0].strip() + package_names.append(package_name) + + for package in package_names: + if package != " ": + home_page = request_package_info_url(package) + if home_page is not None: + if match := urls_in_orgs([home_page], SECOND_PARTY_ORGS): + openedx_packages.append(home_page) + + return openedx_packages FIRST_PARTY_ORGS = ["openedx"] @@ -317,23 +356,23 @@ def process_directory(): "open-craft", "eduNEXT", "raccoongang", ] -# def urls_in_orgs(urls, orgs): -# """ -# Find urls that are in any of the `orgs`. -# """ -# return sorted( -# url for url in urls -# if any(f"/{org}/" in url for org in orgs) -# ) - -def urls_in_orgs(urls, org): +def urls_in_orgs(urls, orgs): """ Find urls that are in any of the `orgs`. """ return sorted( url for url in urls - if f"/{org}/" in url - ) + if any(f"/{org}/" in url for org in orgs) + ) + +# def urls_in_orgs(urls, org): +# """ +# Find urls that are in any of the `orgs`. 
+# """ +# return sorted( +# url for url in urls +# if f"/{org}/" in url +# ) def main(dirs=None, org=None): @@ -345,31 +384,18 @@ def main(dirs=None, org=None): repo_dir = sys.argv[1] org_flag_index = sys.argv.index("--org") org = sys.argv[org_flag_index + 1] - #import pdb; pdb.set_trace() print(f"Creating new work directory: {WORK_DIR}") shutil.rmtree(WORK_DIR, ignore_errors=True) - - #dirs = sorted(itertools.chain.from_iterable(d.splitlines() for d in dirs)) repo_urls = set() - #for i_dir, repo_dir in enumerate(dirs, start=1): -# print(f"-- repo {repo_dir} ({i_dir}/{len(dirs)}) ----------") with change_dir(repo_dir): repo_urls.update(process_directory()) - write_list(WORK_DIR / "repo_urls.txt", sorted(repo_urls)) - - #firsts = urls_in_orgs(repo_urls, FIRST_PARTY_ORGS) - #write_list(WORK_DIR / "first_party_urls.txt", firsts) - seconds = urls_in_orgs(repo_urls, org) - #write_list(WORK_DIR / "second_party_urls.txt", seconds) - print("== DONE ==============") print("Second-party:") - print("\n".join(seconds)) - if seconds: + print("\n".join(repo_urls)) + if repo_urls: sys.exit(1) - if __name__ == "__main__": main()