Merge pull request #7 from cristian-rincon/develop

release: 0.2.3
cristian-rincon · Jun 23, 2023 · 82ad028
2 parents 547ced9 + 0657833
commit 82ad028
Show file tree

Hide file tree

Showing 8 changed files with 99 additions and 210 deletions.
diff --git a/extractor/checks.py b/extractor/checks.py
@@ -49,23 +49,17 @@ def project_url(self, pattern: str, project_url: str, project_urls: Dict) -> str
             logger.debug("Nested metadata found")
             return self.additional_urls(pattern, project_urls)
 
-    def version(
-        self, version: str, pattern: str, raw_data: Dict, filtered_data: Dict
-    ) -> Dict:
+    def version(self, version: str, pattern: str, filtered_data: Dict) -> Dict:
         project_default_error = "No project url found, please check manually"
-        version_default_error = "No version url found, please check manually"
         if version and self.gh_pattern(
             pattern, filtered_data.get("project_url"), project_default_error
         ):
             filtered_data[
                 "version_url"
             ] = f"{filtered_data['project_url']}/tree/{version}/"
-        elif raw_data["release_url"] and self.gh_pattern(
-            pattern, filtered_data.get("project_url"), version_default_error
-        ):
-            filtered_data["version_url"] = raw_data["release_url"]
 
         else:
+            version_default_error = "No version url found, please check manually"
             filtered_data["version_url"] = version_default_error
-            filtered_data["project_url"] = project_default_error
+
         return filtered_data
diff --git a/extractor/core.py b/extractor/core.py
@@ -46,14 +46,16 @@ def filter_data(raw_data: Dict[str, str], version: str) -> Dict[str, str]:
     project_name = raw_data["name"]
     project_url = raw_data["project_url"]
     project_urls = raw_data["project_urls"]
+    project_version = version or raw_data["version"]
     check = StandardCheck()
+    pypi_url = f"https://pypi.org/project/{project_name}/{project_version}/"
     gh_url_pattern = r"(https:\/\/|http:\/\/)github\.com"
     filtered_data = {
         "name": project_name,
-        "version": version or raw_data["version"],
+        "version": project_version,
         "license": check.licenses(raw_data),
-        "homepage": raw_data["home_page"],
-        "release_url": raw_data["release_url"],
+        # "homepage": raw_data["home_page"],
+        "pypi_release_url": pypi_url,
         "project_url": check.project_url(gh_url_pattern, project_url, project_urls),
     }
 
@@ -63,7 +65,8 @@ def filter_data(raw_data: Dict[str, str], version: str) -> Dict[str, str]:
         version = f"v{filtered_data['version']}"
         filtered_data["version"] = version
 
-    filtered_data = check.version(version, gh_url_pattern, raw_data, filtered_data)
+    filtered_data = check.version(version, gh_url_pattern, filtered_data)
+    del filtered_data["project_url"]
     return filtered_data
 
 
@@ -81,13 +84,14 @@ def extract_data(source_path: Path, format: str) -> None:
     logger.info("Starting process")
     logger.debug(f"Retrieving: {source_path}")
     result = Requirements().render(source_path, format)
+    custom_columns_order = ["license", "name"]
     pkgs_raw_metadata = []
     for pkg in track(result):
         filtered_data = filter_data(
             get_raw_data(pkg[0]), pkg[1] if len(pkg) > 1 else None
         )
         pkgs_raw_metadata.append(filtered_data)
-    return pd.DataFrame(pkgs_raw_metadata)
+    return pd.DataFrame(pkgs_raw_metadata).sort_values(by=custom_columns_order)
 
 
 def save_data(data: pd.DataFrame, output: Path):

diff --git a/extractor/main.py b/extractor/main.py
@@ -8,7 +8,7 @@
 from extractor.render import RequirementsFormat
 
 app = typer.Typer()
-VERSION = "0.2.2"
+VERSION = "0.2.3"
 
 
 @app.command(name="version")

diff --git a/mocks/in/pip_freeze/pip_freeze_fmt_sample.txt b/mocks/in/pip_freeze/pip_freeze_fmt_sample.txt
@@ -1,34 +1,13 @@
-black==23.3.0
-certifi==2023.5.7
-charset-normalizer==3.1.0
-click==8.1.3
-et-xmlfile==1.1.0
-idna==3.4
-loguru==0.7.0
-markdown-it-py==2.2.0
-mdurl==0.1.2
-mypy-extensions==1.0.0
-numpy==1.24.3
-openpyxl==3.1.2
-packaging==23.1
-pandas==2.0.2
-pathspec==0.11.1
-platformdirs==3.5.1
-Pygments==2.15.1
-python-dateutil==2.8.2
-pytz==2023.3
-requests==2.31.0
-rich==13.4.1
-shellingham==1.5.0.post1
-six==1.16.0
-tomli==2.0.1
-tqdm==4.65.0
-typer==0.9.0
-typing_extensions==4.6.3
-tzdata==2023.3
-urllib3==2.0.3
-pandas
-great-expectations
-setuptools
-wheel
-typing-extensions
+setuptools==68.0.0
+wheel==0.40.0
+dbx==0.8.10
+typing_extensions==4.5.0
+pytest==7.3.1
+pytest-cov==4.1.0
+pre-commit==3.3.2
+pyspark==3.3.2
+delta-spark==2.3.0
+mlflow==1.29.0
+matplotlib==3.7.1
+databricks-feature-store==0.11.0
+mlflow-skinny==2.3.2
diff --git a/mocks/in/pip_freeze/pip_freeze_fmt_sample_2.txt b/mocks/in/pip_freeze/pip_freeze_fmt_sample_2.txt
@@ -1,128 +1,9 @@
-anyio==3.7.0
-appnope==0.1.3
-argon2-cffi==21.3.0
-argon2-cffi-bindings==21.2.0
-arrow==1.2.3
-astroid==2.15.5
-asttokens==2.2.1
-async-lru==2.0.2
-attrs==23.1.0
-Babel==2.12.1
-backcall==0.2.0
-beautifulsoup4==4.12.2
-black==23.3.0
-bleach==6.0.0
-certifi==2023.5.7
-cffi==1.15.1
-cfgv==3.3.1
-charset-normalizer==3.1.0
-click==8.1.3
-colorama==0.4.6
-comm==0.1.3
-debugpy==1.6.7
-decorator==5.1.1
-defusedxml==0.7.1
-dill==0.3.6
-distlib==0.3.6
-et-xmlfile==1.1.0
-exceptiongroup==1.1.1
-executing==1.2.0
-fastjsonschema==2.17.1
-filelock==3.12.2
-fqdn==1.5.1
-identify==2.5.24
-idna==3.4
-importlib-metadata==6.7.0
-importlib-resources==5.12.0
-iniconfig==2.0.0
-ipykernel==6.23.2
-ipython==8.12.2
-isoduration==20.11.0
-isort==5.12.0
-jedi==0.18.2
-Jinja2==3.1.2
-json5==0.9.14
-jsonpointer==2.4
-jsonschema==4.17.3
-jupyter-events==0.6.3
-jupyter-lsp==2.2.0
-jupyter_client==8.2.0
-jupyter_core==5.3.1
-jupyter_server==2.6.0
-jupyter_server_terminals==0.4.4
-jupyterlab==4.0.2
-jupyterlab-pygments==0.2.2
-jupyterlab_server==2.23.0
-lazy-object-proxy==1.9.0
 loguru==0.7.0
-markdown-it-py==2.2.0
-MarkupSafe==2.1.3
-matplotlib-inline==0.1.6
-mccabe==0.7.0
-mdurl==0.1.2
-mistune==3.0.1
-mypy-extensions==1.0.0
-nbclient==0.8.0
-nbconvert==7.6.0
-nbformat==5.9.0
-nest-asyncio==1.5.6
-nodeenv==1.8.0
-notebook_shim==0.2.3
-numpy==1.24.3
-openpyxl==3.1.2
-overrides==7.3.1
-packaging==23.1
+Jinja2==3.0.3
+azure-storage-blob==12.16.0
+azure-keyvault-secrets==4.7.0
+scikit-learn==1.2.1
+great_expectations==0.15.47
 pandas==2.0.2
-pandocfilters==1.5.0
-parso==0.8.3
-pathspec==0.11.1
-pexpect==4.8.0
-pickleshare==0.7.5
-pkgutil_resolve_name==1.3.10
-platformdirs==3.5.1
-pluggy==1.0.0
-pre-commit==3.3.2
-prometheus-client==0.17.0
-prompt-toolkit==3.0.38
-psutil==5.9.5
-ptyprocess==0.7.0
-pure-eval==0.2.2
-pycparser==2.21
-Pygments==2.15.1
-pylint==2.17.4
-pyrsistent==0.19.3
-pytest==7.3.1
-python-dateutil==2.8.2
-python-json-logger==2.0.7
-pytz==2023.3
-PyYAML==6.0
-pyzmq==25.1.0
-requests==2.31.0
-rfc3339-validator==0.1.4
-rfc3986-validator==0.1.1
-rich==13.4.1
-Send2Trash==1.8.2
-shellingham==1.5.0.post1
-six==1.16.0
-sniffio==1.3.0
-soupsieve==2.4.1
-stack-data==0.6.2
-terminado==0.17.1
-tinycss2==1.2.1
-toml==0.10.2
-tomli==2.0.1
-tomlkit==0.11.8
-tornado==6.3.2
-traitlets==5.9.0
-typer==0.9.0
-typing_extensions==4.6.3
-tzdata==2023.3
-uri-template==1.3.0
-urllib3==2.0.3
-virtualenv==20.23.0
-wcwidth==0.2.6
-webcolors==1.13
-webencodings==0.5.1
-websocket-client==1.6.0
-wrapt==1.15.0
-zipp==3.15.0
+# Sphinx
+# sphinx-rtd-theme
diff --git a/notebooks/sandbox.ipynb b/notebooks/sandbox.ipynb
@@ -82,59 +82,90 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 5,
    "id": "94340c07-1598-4aa9-98fc-45ff005eaeaf",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'author': None,\n",
-       " 'author_email': 'Leonard Richardson <[email protected]>',\n",
+       "{'author': 'Thunder Shiviah, Michael Shtelma, Ivan Trusov',\n",
+       " 'author_email': '',\n",
        " 'bugtrack_url': None,\n",
-       " 'classifiers': ['Development Status :: 5 - Production/Stable',\n",
-       "  'Intended Audience :: Developers',\n",
-       "  'License :: OSI Approved :: MIT License',\n",
-       "  'Programming Language :: Python',\n",
-       "  'Programming Language :: Python :: 3',\n",
-       "  'Topic :: Software Development :: Libraries :: Python Modules',\n",
-       "  'Topic :: Text Processing :: Markup :: HTML',\n",
-       "  'Topic :: Text Processing :: Markup :: SGML',\n",
-       "  'Topic :: Text Processing :: Markup :: XML'],\n",
-       " 'description': 'Beautiful Soup is a library that makes it easy to scrape information\\nfrom web pages. It sits atop an HTML or XML parser, providing Pythonic\\nidioms for iterating, searching, and modifying the parse tree.\\n\\n# Quick start\\n\\n```\\n>>> from bs4 import BeautifulSoup\\n>>> soup = BeautifulSoup(\"<p>Some<b>bad<i>HTML\")\\n>>> print(soup.prettify())\\n<html>\\n <body>\\n  <p>\\n   Some\\n   <b>\\n    bad\\n    <i>\\n     HTML\\n    </i>\\n   </b>\\n  </p>\\n </body>\\n</html>\\n>>> soup.find(text=\"bad\")\\n\\'bad\\'\\n>>> soup.i\\n<i>HTML</i>\\n#\\n>>> soup = BeautifulSoup(\"<tag1>Some<tag2/>bad<tag3>XML\", \"xml\")\\n#\\n>>> print(soup.prettify())\\n<?xml version=\"1.0\" encoding=\"utf-8\"?>\\n<tag1>\\n Some\\n <tag2/>\\n bad\\n <tag3>\\n  XML\\n </tag3>\\n</tag1>\\n```\\n\\nTo go beyond the basics, [comprehensive documentation is available](https://www.crummy.com/software/BeautifulSoup/bs4/doc/).\\n\\n# Links\\n\\n* [Homepage](https://www.crummy.com/software/BeautifulSoup/bs4/)\\n* [Documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)\\n* [Discussion group](https://groups.google.com/group/beautifulsoup/)\\n* [Development](https://code.launchpad.net/beautifulsoup/)\\n* [Bug tracker](https://bugs.launchpad.net/beautifulsoup/)\\n* [Complete changelog](https://bazaar.launchpad.net/~leonardr/beautifulsoup/bs4/view/head:/CHANGELOG)\\n\\n# Note on Python 2 sunsetting\\n\\nBeautiful Soup\\'s support for Python 2 was discontinued on December 31,\\n2020: one year after the sunset date for Python 2 itself. From this\\npoint onward, new Beautiful Soup development will exclusively target\\nPython 3. 
The final release of Beautiful Soup 4 to support Python 2\\nwas 4.9.3.\\n\\n# Supporting the project\\n\\nIf you use Beautiful Soup as part of your professional work, please consider a\\n[Tidelift subscription](https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=readme).\\nThis will support many of the free software projects your organization\\ndepends on, not just Beautiful Soup.\\n\\nIf you use Beautiful Soup for personal projects, the best way to say\\nthank you is to read\\n[Tool Safety](https://www.crummy.com/software/BeautifulSoup/zine/), a zine I\\nwrote about what Beautiful Soup has taught me about software\\ndevelopment.\\n\\n# Building the documentation\\n\\nThe bs4/doc/ directory contains full documentation in Sphinx\\nformat. Run `make html` in that directory to create HTML\\ndocumentation.\\n\\n# Running the unit tests\\n\\nBeautiful Soup supports unit test discovery using Pytest:\\n\\n```\\n$ pytest\\n```\\n\\n',\n",
+       " 'classifiers': ['Intended Audience :: Developers',\n",
+       "  'Intended Audience :: System Administrators'],\n",
+       " 'description': '# dbx by Databricks Labs\\n\\n<p align=\"center\">\\n    <a href=\"https://dbx.readthedocs.io/\">\\n        <img src=\"https://raw.githubusercontent.com/databrickslabs/dbx/master/images/logo.svg\" class=\"align-center\" width=\"200\" height=\"200\" alt=\"logo\" />\\n    </a>\\n</p>\\n\\n<p align=\"center\">\\n    <b>🧱Databricks CLI eXtensions - aka <code>dbx</code> is a CLI tool for development and advanced Databricks workflows management.</b>\\n</p>\\n\\n---\\n\\n<p align=\"center\">\\n    <a href=\"https://dbx.readthedocs.io/en/latest/?badge=latest\">\\n        <img src=\"https://img.shields.io/readthedocs/dbx?style=for-the-badge\" alt=\"Documentation Status\"/>\\n    </a>\\n    <a href=\"https://pypi.org/project/dbx/\">\\n        <img src=\"https://img.shields.io/pypi/v/dbx?color=green&amp;style=for-the-badge\" alt=\"Latest Python Release\"/>\\n    </a>\\n    <a href=\"https://codecov.io/gh/databrickslabs/dbx\">\\n        <img src=\"https://img.shields.io/codecov/c/github/databrickslabs/dbx?style=for-the-badge&amp;token=S7ADH3W2E3\"\\n             alt=\"codecov\"/>\\n    </a>\\n    <a href=\"https://pypistats.org/packages/dbx\">\\n        <img src=\"https://img.shields.io/pypi/dm/dbx?style=for-the-badge\" alt=\"downloads\"/>\\n    </a>\\n    <a href=\"https://github.com/psf/black\">\\n        <img src=\"https://img.shields.io/badge/code%20style-black-000000.svg?style=for-the-badge\"\\n             alt=\"We use black for formatting\"/>\\n    </a>\\n</p>\\n\\n---\\n\\n## Concept\\n\\n`dbx` simplifies Databricks workflows development, deployment and launch across multiple\\nenvironments. It also helps to package your project and deliver it to\\nyour Databricks environment in a versioned fashion. 
Designed in a\\nCLI-first manner, it is built to be actively used both inside CI/CD\\npipelines and as a part of local tooling for rapid prototyping.\\n\\n## Requirements\\n\\n- Python Version \\\\> 3.8\\n- `pip` or `conda`\\n\\n## Installation\\n\\n- with `pip`:\\n\\n```\\npip install dbx\\n```\\n\\n## Documentation\\n\\nPlease refer to the [docs page](https://dbx.readthedocs.io/en/latest/index.html).\\n\\n## Interface versioning\\n\\nFor CLI interfaces, we support [SemVer](https://semver.org/) approach.\\nHowever, for API components we don\\'t use SemVer as of now. This may lead\\nto instability when using `dbx` API methods directly.\\n\\n## Legal Information\\n\\nThis software is provided as-is and is not officially supported by\\nDatabricks through customer technical support channels. Support,\\nquestions, and feature requests can be communicated through the Issues\\npage of this repo. Please see the legal agreement and understand that\\nissues with the use of this code will not be answered or investigated by\\nDatabricks Support.\\n\\n## Feedback\\n\\nIssues with `dbx`? Found a bug? Have a great idea for an addition? Feel\\nfree to file an\\n[issue](https://github.com/databrickslabs/dbx/issues/new/choose).\\n\\n## Contributing\\n\\nPlease find more details about contributing to `dbx` in the contributing\\n[doc](https://github.com/databrickslabs/dbx/blob/master/contrib/CONTRIBUTING.md).\\n',\n",
        " 'description_content_type': 'text/markdown',\n",
        " 'docs_url': None,\n",
-       " 'download_url': None,\n",
+       " 'download_url': '',\n",
        " 'downloads': {'last_day': -1, 'last_month': -1, 'last_week': -1},\n",
-       " 'home_page': None,\n",
-       " 'keywords': 'HTML,XML,parse,soup',\n",
-       " 'license': None,\n",
-       " 'maintainer': None,\n",
-       " 'maintainer_email': None,\n",
-       " 'name': 'beautifulsoup4',\n",
-       " 'package_url': 'https://pypi.org/project/beautifulsoup4/',\n",
+       " 'home_page': '',\n",
+       " 'keywords': '',\n",
+       " 'license': 'Databricks License',\n",
+       " 'maintainer': '',\n",
+       " 'maintainer_email': '',\n",
+       " 'name': 'dbx',\n",
+       " 'package_url': 'https://pypi.org/project/dbx/',\n",
        " 'platform': None,\n",
-       " 'project_url': 'https://pypi.org/project/beautifulsoup4/',\n",
-       " 'project_urls': {'Download': 'https://www.crummy.com/software/BeautifulSoup/bs4/download/',\n",
-       "  'Homepage': 'https://www.crummy.com/software/BeautifulSoup/bs4/'},\n",
-       " 'release_url': 'https://pypi.org/project/beautifulsoup4/4.12.2/',\n",
-       " 'requires_dist': ['soupsieve>1.2',\n",
-       "  \"html5lib; extra == 'html5lib'\",\n",
-       "  \"lxml; extra == 'lxml'\"],\n",
-       " 'requires_python': '>=3.6.0',\n",
-       " 'summary': 'Screen-scraping library',\n",
-       " 'version': '4.12.2',\n",
+       " 'project_url': 'https://pypi.org/project/dbx/',\n",
+       " 'project_urls': None,\n",
+       " 'release_url': 'https://pypi.org/project/dbx/0.8.17/',\n",
+       " 'requires_dist': ['requests (<3.0.0,>=2.30.1)',\n",
+       "  'mlflow-skinny (<3.0.0,>=2.0.0)',\n",
+       "  'databricks-cli (<0.18,>=0.17)',\n",
+       "  'tenacity (<=9.0.0,>=8.2.2)',\n",
+       "  'click (<9.0.0,>=8.1.0)',\n",
+       "  'rich (==12.6.0)',\n",
+       "  'typer[all] (==0.7.0)',\n",
+       "  'cookiecutter (<3.0.0,>2.1.0)',\n",
+       "  'pyyaml (>=6.0)',\n",
+       "  'pydantic (<2.0.0,>=1.10.8)',\n",
+       "  'Jinja2 (>=2.11.2)',\n",
+       "  'cryptography (<42.0.0,>=41.0.0)',\n",
+       "  'aiohttp (>=3.8.2)',\n",
+       "  'pathspec (>=0.9.0)',\n",
+       "  'watchdog (>=2.1.0)',\n",
+       "  \"boto3 (<2,>=1.26.13) ; extra == 'aws'\",\n",
+       "  \"azure-storage-blob (<13.0.0,>=12.14.1) ; extra == 'azure'\",\n",
+       "  \"azure-identity (<2.0.0,>=1.12.0) ; extra == 'azure'\",\n",
+       "  \"mkdocs (<2.0.0,>=1.1.2) ; extra == 'dev'\",\n",
+       "  \"mkdocs-click (<1.0,>=0.8.0) ; extra == 'dev'\",\n",
+       "  \"mkdocs-material (<10.0.0,>=9.0.8) ; extra == 'dev'\",\n",
+       "  \"mdx-include (<2.0.0,>=1.4.1) ; extra == 'dev'\",\n",
+       "  \"mkdocs-markdownextradata-plugin (<0.3.0,>=0.1.7) ; extra == 'dev'\",\n",
+       "  \"mkdocs-glightbox (<1.0,>=0.2.1) ; extra == 'dev'\",\n",
+       "  \"mkdocs-git-revision-date-localized-plugin (<=2.0,>=1.1.0) ; extra == 'dev'\",\n",
+       "  \"pre-commit (<4.0.0,>=2.20.0) ; extra == 'dev'\",\n",
+       "  \"pylint (==2.15.6) ; extra == 'dev'\",\n",
+       "  \"pycodestyle (==2.8.0) ; extra == 'dev'\",\n",
+       "  \"pyflakes (==2.5.0) ; extra == 'dev'\",\n",
+       "  \"mccabe (==0.6.1) ; extra == 'dev'\",\n",
+       "  \"prospector (==1.7.7) ; extra == 'dev'\",\n",
+       "  \"black (<23.0.0,>=22.3.0) ; extra == 'dev'\",\n",
+       "  \"MarkupSafe (<3.0.0,>=2.1.1) ; extra == 'dev'\",\n",
+       "  \"pytest (<8.0.0,>=7.1.3) ; extra == 'dev'\",\n",
+       "  \"pytest-mock (<3.11.0,>=3.8.2) ; extra == 'dev'\",\n",
+       "  \"pytest-xdist[psutil] (<3.0.0,>=2.5.0) ; extra == 'dev'\",\n",
+       "  \"pytest-asyncio (<1.0.0,>=0.18.3) ; extra == 'dev'\",\n",
+       "  \"pytest-cov (<5.0.0,>=4.0.0) ; extra == 'dev'\",\n",
+       "  \"pytest-timeout (<3.0.0,>=2.1.0) ; extra == 'dev'\",\n",
+       "  \"pytest-clarity (<2.0.0,>=1.0.1) ; extra == 'dev'\",\n",
+       "  \"poetry (>=1.2.0) ; extra == 'dev'\",\n",
+       "  \"google-cloud-storage (<3.0.0,>=2.6.0) ; extra == 'gcp'\"],\n",
+       " 'requires_python': '>=3.8',\n",
+       " 'summary': 'DataBricks CLI eXtensions aka dbx',\n",
+       " 'version': '0.8.17',\n",
        " 'yanked': False,\n",
        " 'yanked_reason': None}"
       ]
      },
-     "execution_count": 23,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "data = get_raw_data(\"beautifulsoup4\")\n",
+    "data = get_raw_data(\"dbx\")\n",
     "data"
    ]
   },

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pymetasnap"
-version = "0.2.2"
+version = "0.2.3"
 description = "This package allows you to scrape metadata from the Python Package Index"
 authors = ["cristian-rincon <[email protected]>"]
 license = "MIT"