Skip to content

Commit

Permalink
Merge pull request #7 from cristian-rincon/develop
Browse files Browse the repository at this point in the history
release: 0.2.3
  • Loading branch information
cristian-rincon authored Jun 23, 2023
2 parents 547ced9 + 0657833 commit 82ad028
Show file tree
Hide file tree
Showing 8 changed files with 99 additions and 210 deletions.
12 changes: 3 additions & 9 deletions extractor/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,23 +49,17 @@ def project_url(self, pattern: str, project_url: str, project_urls: Dict) -> str
logger.debug("Nested metadata found")
return self.additional_urls(pattern, project_urls)

def version(
self, version: str, pattern: str, raw_data: Dict, filtered_data: Dict
) -> Dict:
def version(self, version: str, pattern: str, filtered_data: Dict) -> Dict:
project_default_error = "No project url found, please check manually"
version_default_error = "No version url found, please check manually"
if version and self.gh_pattern(
pattern, filtered_data.get("project_url"), project_default_error
):
filtered_data[
"version_url"
] = f"{filtered_data['project_url']}/tree/{version}/"
elif raw_data["release_url"] and self.gh_pattern(
pattern, filtered_data.get("project_url"), version_default_error
):
filtered_data["version_url"] = raw_data["release_url"]

else:
version_default_error = "No version url found, please check manually"
filtered_data["version_url"] = version_default_error
filtered_data["project_url"] = project_default_error

return filtered_data
14 changes: 9 additions & 5 deletions extractor/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,16 @@ def filter_data(raw_data: Dict[str, str], version: str) -> Dict[str, str]:
project_name = raw_data["name"]
project_url = raw_data["project_url"]
project_urls = raw_data["project_urls"]
project_version = version or raw_data["version"]
check = StandardCheck()
pypi_url = f"https://pypi.org/project/{project_name}/{project_version}/"
gh_url_pattern = r"(https:\/\/|http:\/\/)github\.com"
filtered_data = {
"name": project_name,
"version": version or raw_data["version"],
"version": project_version,
"license": check.licenses(raw_data),
"homepage": raw_data["home_page"],
"release_url": raw_data["release_url"],
# "homepage": raw_data["home_page"],
"pypi_release_url": pypi_url,
"project_url": check.project_url(gh_url_pattern, project_url, project_urls),
}

Expand All @@ -63,7 +65,8 @@ def filter_data(raw_data: Dict[str, str], version: str) -> Dict[str, str]:
version = f"v{filtered_data['version']}"
filtered_data["version"] = version

filtered_data = check.version(version, gh_url_pattern, raw_data, filtered_data)
filtered_data = check.version(version, gh_url_pattern, filtered_data)
del filtered_data["project_url"]
return filtered_data


Expand All @@ -81,13 +84,14 @@ def extract_data(source_path: Path, format: str) -> None:
logger.info("Starting process")
logger.debug(f"Retrieving: {source_path}")
result = Requirements().render(source_path, format)
custom_columns_order = ["license", "name"]
pkgs_raw_metadata = []
for pkg in track(result):
filtered_data = filter_data(
get_raw_data(pkg[0]), pkg[1] if len(pkg) > 1 else None
)
pkgs_raw_metadata.append(filtered_data)
return pd.DataFrame(pkgs_raw_metadata)
return pd.DataFrame(pkgs_raw_metadata).sort_values(by=custom_columns_order)


def save_data(data: pd.DataFrame, output: Path):
Expand Down
2 changes: 1 addition & 1 deletion extractor/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from extractor.render import RequirementsFormat

app = typer.Typer()
VERSION = "0.2.2"
VERSION = "0.2.3"


@app.command(name="version")
Expand Down
47 changes: 13 additions & 34 deletions mocks/in/pip_freeze/pip_freeze_fmt_sample.txt
Original file line number Diff line number Diff line change
@@ -1,34 +1,13 @@
black==23.3.0
certifi==2023.5.7
charset-normalizer==3.1.0
click==8.1.3
et-xmlfile==1.1.0
idna==3.4
loguru==0.7.0
markdown-it-py==2.2.0
mdurl==0.1.2
mypy-extensions==1.0.0
numpy==1.24.3
openpyxl==3.1.2
packaging==23.1
pandas==2.0.2
pathspec==0.11.1
platformdirs==3.5.1
Pygments==2.15.1
python-dateutil==2.8.2
pytz==2023.3
requests==2.31.0
rich==13.4.1
shellingham==1.5.0.post1
six==1.16.0
tomli==2.0.1
tqdm==4.65.0
typer==0.9.0
typing_extensions==4.6.3
tzdata==2023.3
urllib3==2.0.3
pandas
great-expectations
setuptools
wheel
typing-extensions
setuptools==68.0.0
wheel==0.40.0
dbx==0.8.10
typing_extensions==4.5.0
pytest==7.3.1
pytest-cov==4.1.0
pre-commit==3.3.2
pyspark==3.3.2
delta-spark==2.3.0
mlflow==1.29.0
matplotlib==3.7.1
databricks-feature-store==0.11.0
mlflow-skinny==2.3.2
133 changes: 7 additions & 126 deletions mocks/in/pip_freeze/pip_freeze_fmt_sample_2.txt
Original file line number Diff line number Diff line change
@@ -1,128 +1,9 @@
anyio==3.7.0
appnope==0.1.3
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
arrow==1.2.3
astroid==2.15.5
asttokens==2.2.1
async-lru==2.0.2
attrs==23.1.0
Babel==2.12.1
backcall==0.2.0
beautifulsoup4==4.12.2
black==23.3.0
bleach==6.0.0
certifi==2023.5.7
cffi==1.15.1
cfgv==3.3.1
charset-normalizer==3.1.0
click==8.1.3
colorama==0.4.6
comm==0.1.3
debugpy==1.6.7
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.6
distlib==0.3.6
et-xmlfile==1.1.0
exceptiongroup==1.1.1
executing==1.2.0
fastjsonschema==2.17.1
filelock==3.12.2
fqdn==1.5.1
identify==2.5.24
idna==3.4
importlib-metadata==6.7.0
importlib-resources==5.12.0
iniconfig==2.0.0
ipykernel==6.23.2
ipython==8.12.2
isoduration==20.11.0
isort==5.12.0
jedi==0.18.2
Jinja2==3.1.2
json5==0.9.14
jsonpointer==2.4
jsonschema==4.17.3
jupyter-events==0.6.3
jupyter-lsp==2.2.0
jupyter_client==8.2.0
jupyter_core==5.3.1
jupyter_server==2.6.0
jupyter_server_terminals==0.4.4
jupyterlab==4.0.2
jupyterlab-pygments==0.2.2
jupyterlab_server==2.23.0
lazy-object-proxy==1.9.0
loguru==0.7.0
markdown-it-py==2.2.0
MarkupSafe==2.1.3
matplotlib-inline==0.1.6
mccabe==0.7.0
mdurl==0.1.2
mistune==3.0.1
mypy-extensions==1.0.0
nbclient==0.8.0
nbconvert==7.6.0
nbformat==5.9.0
nest-asyncio==1.5.6
nodeenv==1.8.0
notebook_shim==0.2.3
numpy==1.24.3
openpyxl==3.1.2
overrides==7.3.1
packaging==23.1
Jinja2==3.0.3
azure-storage-blob==12.16.0
azure-keyvault-secrets==4.7.0
scikit-learn==1.2.1
great_expectations==0.15.47
pandas==2.0.2
pandocfilters==1.5.0
parso==0.8.3
pathspec==0.11.1
pexpect==4.8.0
pickleshare==0.7.5
pkgutil_resolve_name==1.3.10
platformdirs==3.5.1
pluggy==1.0.0
pre-commit==3.3.2
prometheus-client==0.17.0
prompt-toolkit==3.0.38
psutil==5.9.5
ptyprocess==0.7.0
pure-eval==0.2.2
pycparser==2.21
Pygments==2.15.1
pylint==2.17.4
pyrsistent==0.19.3
pytest==7.3.1
python-dateutil==2.8.2
python-json-logger==2.0.7
pytz==2023.3
PyYAML==6.0
pyzmq==25.1.0
requests==2.31.0
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rich==13.4.1
Send2Trash==1.8.2
shellingham==1.5.0.post1
six==1.16.0
sniffio==1.3.0
soupsieve==2.4.1
stack-data==0.6.2
terminado==0.17.1
tinycss2==1.2.1
toml==0.10.2
tomli==2.0.1
tomlkit==0.11.8
tornado==6.3.2
traitlets==5.9.0
typer==0.9.0
typing_extensions==4.6.3
tzdata==2023.3
uri-template==1.3.0
urllib3==2.0.3
virtualenv==20.23.0
wcwidth==0.2.6
webcolors==1.13
webencodings==0.5.1
websocket-client==1.6.0
wrapt==1.15.0
zipp==3.15.0
# Sphinx
# sphinx-rtd-theme
97 changes: 64 additions & 33 deletions notebooks/sandbox.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -82,59 +82,90 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 5,
"id": "94340c07-1598-4aa9-98fc-45ff005eaeaf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'author': None,\n",
" 'author_email': 'Leonard Richardson <[email protected]>',\n",
"{'author': 'Thunder Shiviah, Michael Shtelma, Ivan Trusov',\n",
" 'author_email': '',\n",
" 'bugtrack_url': None,\n",
" 'classifiers': ['Development Status :: 5 - Production/Stable',\n",
" 'Intended Audience :: Developers',\n",
" 'License :: OSI Approved :: MIT License',\n",
" 'Programming Language :: Python',\n",
" 'Programming Language :: Python :: 3',\n",
" 'Topic :: Software Development :: Libraries :: Python Modules',\n",
" 'Topic :: Text Processing :: Markup :: HTML',\n",
" 'Topic :: Text Processing :: Markup :: SGML',\n",
" 'Topic :: Text Processing :: Markup :: XML'],\n",
" 'description': 'Beautiful Soup is a library that makes it easy to scrape information\\nfrom web pages. It sits atop an HTML or XML parser, providing Pythonic\\nidioms for iterating, searching, and modifying the parse tree.\\n\\n# Quick start\\n\\n```\\n>>> from bs4 import BeautifulSoup\\n>>> soup = BeautifulSoup(\"<p>Some<b>bad<i>HTML\")\\n>>> print(soup.prettify())\\n<html>\\n <body>\\n <p>\\n Some\\n <b>\\n bad\\n <i>\\n HTML\\n </i>\\n </b>\\n </p>\\n </body>\\n</html>\\n>>> soup.find(text=\"bad\")\\n\\'bad\\'\\n>>> soup.i\\n<i>HTML</i>\\n#\\n>>> soup = BeautifulSoup(\"<tag1>Some<tag2/>bad<tag3>XML\", \"xml\")\\n#\\n>>> print(soup.prettify())\\n<?xml version=\"1.0\" encoding=\"utf-8\"?>\\n<tag1>\\n Some\\n <tag2/>\\n bad\\n <tag3>\\n XML\\n </tag3>\\n</tag1>\\n```\\n\\nTo go beyond the basics, [comprehensive documentation is available](https://www.crummy.com/software/BeautifulSoup/bs4/doc/).\\n\\n# Links\\n\\n* [Homepage](https://www.crummy.com/software/BeautifulSoup/bs4/)\\n* [Documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)\\n* [Discussion group](https://groups.google.com/group/beautifulsoup/)\\n* [Development](https://code.launchpad.net/beautifulsoup/)\\n* [Bug tracker](https://bugs.launchpad.net/beautifulsoup/)\\n* [Complete changelog](https://bazaar.launchpad.net/~leonardr/beautifulsoup/bs4/view/head:/CHANGELOG)\\n\\n# Note on Python 2 sunsetting\\n\\nBeautiful Soup\\'s support for Python 2 was discontinued on December 31,\\n2020: one year after the sunset date for Python 2 itself. From this\\npoint onward, new Beautiful Soup development will exclusively target\\nPython 3. The final release of Beautiful Soup 4 to support Python 2\\nwas 4.9.3.\\n\\n# Supporting the project\\n\\nIf you use Beautiful Soup as part of your professional work, please consider a\\n[Tidelift subscription](https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=readme).\\nThis will support many of the free software projects your organization\\ndepends on, not just Beautiful Soup.\\n\\nIf you use Beautiful Soup for personal projects, the best way to say\\nthank you is to read\\n[Tool Safety](https://www.crummy.com/software/BeautifulSoup/zine/), a zine I\\nwrote about what Beautiful Soup has taught me about software\\ndevelopment.\\n\\n# Building the documentation\\n\\nThe bs4/doc/ directory contains full documentation in Sphinx\\nformat. Run `make html` in that directory to create HTML\\ndocumentation.\\n\\n# Running the unit tests\\n\\nBeautiful Soup supports unit test discovery using Pytest:\\n\\n```\\n$ pytest\\n```\\n\\n',\n",
" 'classifiers': ['Intended Audience :: Developers',\n",
" 'Intended Audience :: System Administrators'],\n",
" 'description': '# dbx by Databricks Labs\\n\\n<p align=\"center\">\\n <a href=\"https://dbx.readthedocs.io/\">\\n <img src=\"https://raw.githubusercontent.com/databrickslabs/dbx/master/images/logo.svg\" class=\"align-center\" width=\"200\" height=\"200\" alt=\"logo\" />\\n </a>\\n</p>\\n\\n<p align=\"center\">\\n <b>🧱Databricks CLI eXtensions - aka <code>dbx</code> is a CLI tool for development and advanced Databricks workflows management.</b>\\n</p>\\n\\n---\\n\\n<p align=\"center\">\\n <a href=\"https://dbx.readthedocs.io/en/latest/?badge=latest\">\\n <img src=\"https://img.shields.io/readthedocs/dbx?style=for-the-badge\" alt=\"Documentation Status\"/>\\n </a>\\n <a href=\"https://pypi.org/project/dbx/\">\\n <img src=\"https://img.shields.io/pypi/v/dbx?color=green&amp;style=for-the-badge\" alt=\"Latest Python Release\"/>\\n </a>\\n <a href=\"https://codecov.io/gh/databrickslabs/dbx\">\\n <img src=\"https://img.shields.io/codecov/c/github/databrickslabs/dbx?style=for-the-badge&amp;token=S7ADH3W2E3\"\\n alt=\"codecov\"/>\\n </a>\\n <a href=\"https://pypistats.org/packages/dbx\">\\n <img src=\"https://img.shields.io/pypi/dm/dbx?style=for-the-badge\" alt=\"downloads\"/>\\n </a>\\n <a href=\"https://github.com/psf/black\">\\n <img src=\"https://img.shields.io/badge/code%20style-black-000000.svg?style=for-the-badge\"\\n alt=\"We use black for formatting\"/>\\n </a>\\n</p>\\n\\n---\\n\\n## Concept\\n\\n`dbx` simplifies Databricks workflows development, deployment and launch across multiple\\nenvironments. It also helps to package your project and deliver it to\\nyour Databricks environment in a versioned fashion. Designed in a\\nCLI-first manner, it is built to be actively used both inside CI/CD\\npipelines and as a part of local tooling for rapid prototyping.\\n\\n## Requirements\\n\\n- Python Version \\\\> 3.8\\n- `pip` or `conda`\\n\\n## Installation\\n\\n- with `pip`:\\n\\n```\\npip install dbx\\n```\\n\\n## Documentation\\n\\nPlease refer to the [docs page](https://dbx.readthedocs.io/en/latest/index.html).\\n\\n## Interface versioning\\n\\nFor CLI interfaces, we support [SemVer](https://semver.org/) approach.\\nHowever, for API components we don\\'t use SemVer as of now. This may lead\\nto instability when using `dbx` API methods directly.\\n\\n## Legal Information\\n\\nThis software is provided as-is and is not officially supported by\\nDatabricks through customer technical support channels. Support,\\nquestions, and feature requests can be communicated through the Issues\\npage of this repo. Please see the legal agreement and understand that\\nissues with the use of this code will not be answered or investigated by\\nDatabricks Support.\\n\\n## Feedback\\n\\nIssues with `dbx`? Found a bug? Have a great idea for an addition? Feel\\nfree to file an\\n[issue](https://github.com/databrickslabs/dbx/issues/new/choose).\\n\\n## Contributing\\n\\nPlease find more details about contributing to `dbx` in the contributing\\n[doc](https://github.com/databrickslabs/dbx/blob/master/contrib/CONTRIBUTING.md).\\n',\n",
" 'description_content_type': 'text/markdown',\n",
" 'docs_url': None,\n",
" 'download_url': None,\n",
" 'download_url': '',\n",
" 'downloads': {'last_day': -1, 'last_month': -1, 'last_week': -1},\n",
" 'home_page': None,\n",
" 'keywords': 'HTML,XML,parse,soup',\n",
" 'license': None,\n",
" 'maintainer': None,\n",
" 'maintainer_email': None,\n",
" 'name': 'beautifulsoup4',\n",
" 'package_url': 'https://pypi.org/project/beautifulsoup4/',\n",
" 'home_page': '',\n",
" 'keywords': '',\n",
" 'license': 'Databricks License',\n",
" 'maintainer': '',\n",
" 'maintainer_email': '',\n",
" 'name': 'dbx',\n",
" 'package_url': 'https://pypi.org/project/dbx/',\n",
" 'platform': None,\n",
" 'project_url': 'https://pypi.org/project/beautifulsoup4/',\n",
" 'project_urls': {'Download': 'https://www.crummy.com/software/BeautifulSoup/bs4/download/',\n",
" 'Homepage': 'https://www.crummy.com/software/BeautifulSoup/bs4/'},\n",
" 'release_url': 'https://pypi.org/project/beautifulsoup4/4.12.2/',\n",
" 'requires_dist': ['soupsieve>1.2',\n",
" \"html5lib; extra == 'html5lib'\",\n",
" \"lxml; extra == 'lxml'\"],\n",
" 'requires_python': '>=3.6.0',\n",
" 'summary': 'Screen-scraping library',\n",
" 'version': '4.12.2',\n",
" 'project_url': 'https://pypi.org/project/dbx/',\n",
" 'project_urls': None,\n",
" 'release_url': 'https://pypi.org/project/dbx/0.8.17/',\n",
" 'requires_dist': ['requests (<3.0.0,>=2.30.1)',\n",
" 'mlflow-skinny (<3.0.0,>=2.0.0)',\n",
" 'databricks-cli (<0.18,>=0.17)',\n",
" 'tenacity (<=9.0.0,>=8.2.2)',\n",
" 'click (<9.0.0,>=8.1.0)',\n",
" 'rich (==12.6.0)',\n",
" 'typer[all] (==0.7.0)',\n",
" 'cookiecutter (<3.0.0,>2.1.0)',\n",
" 'pyyaml (>=6.0)',\n",
" 'pydantic (<2.0.0,>=1.10.8)',\n",
" 'Jinja2 (>=2.11.2)',\n",
" 'cryptography (<42.0.0,>=41.0.0)',\n",
" 'aiohttp (>=3.8.2)',\n",
" 'pathspec (>=0.9.0)',\n",
" 'watchdog (>=2.1.0)',\n",
" \"boto3 (<2,>=1.26.13) ; extra == 'aws'\",\n",
" \"azure-storage-blob (<13.0.0,>=12.14.1) ; extra == 'azure'\",\n",
" \"azure-identity (<2.0.0,>=1.12.0) ; extra == 'azure'\",\n",
" \"mkdocs (<2.0.0,>=1.1.2) ; extra == 'dev'\",\n",
" \"mkdocs-click (<1.0,>=0.8.0) ; extra == 'dev'\",\n",
" \"mkdocs-material (<10.0.0,>=9.0.8) ; extra == 'dev'\",\n",
" \"mdx-include (<2.0.0,>=1.4.1) ; extra == 'dev'\",\n",
" \"mkdocs-markdownextradata-plugin (<0.3.0,>=0.1.7) ; extra == 'dev'\",\n",
" \"mkdocs-glightbox (<1.0,>=0.2.1) ; extra == 'dev'\",\n",
" \"mkdocs-git-revision-date-localized-plugin (<=2.0,>=1.1.0) ; extra == 'dev'\",\n",
" \"pre-commit (<4.0.0,>=2.20.0) ; extra == 'dev'\",\n",
" \"pylint (==2.15.6) ; extra == 'dev'\",\n",
" \"pycodestyle (==2.8.0) ; extra == 'dev'\",\n",
" \"pyflakes (==2.5.0) ; extra == 'dev'\",\n",
" \"mccabe (==0.6.1) ; extra == 'dev'\",\n",
" \"prospector (==1.7.7) ; extra == 'dev'\",\n",
" \"black (<23.0.0,>=22.3.0) ; extra == 'dev'\",\n",
" \"MarkupSafe (<3.0.0,>=2.1.1) ; extra == 'dev'\",\n",
" \"pytest (<8.0.0,>=7.1.3) ; extra == 'dev'\",\n",
" \"pytest-mock (<3.11.0,>=3.8.2) ; extra == 'dev'\",\n",
" \"pytest-xdist[psutil] (<3.0.0,>=2.5.0) ; extra == 'dev'\",\n",
" \"pytest-asyncio (<1.0.0,>=0.18.3) ; extra == 'dev'\",\n",
" \"pytest-cov (<5.0.0,>=4.0.0) ; extra == 'dev'\",\n",
" \"pytest-timeout (<3.0.0,>=2.1.0) ; extra == 'dev'\",\n",
" \"pytest-clarity (<2.0.0,>=1.0.1) ; extra == 'dev'\",\n",
" \"poetry (>=1.2.0) ; extra == 'dev'\",\n",
" \"google-cloud-storage (<3.0.0,>=2.6.0) ; extra == 'gcp'\"],\n",
" 'requires_python': '>=3.8',\n",
" 'summary': 'DataBricks CLI eXtensions aka dbx',\n",
" 'version': '0.8.17',\n",
" 'yanked': False,\n",
" 'yanked_reason': None}"
]
},
"execution_count": 23,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = get_raw_data(\"beautifulsoup4\")\n",
"data = get_raw_data(\"dbx\")\n",
"data"
]
},
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pymetasnap"
version = "0.2.2"
version = "0.2.3"
description = "This package allows you to scrape metadata from the Python Package Index"
authors = ["cristian-rincon <[email protected]>"]
license = "MIT"
Expand Down
Loading

0 comments on commit 82ad028

Please sign in to comment.