From 5a509aba43f6873982f42bfe6087d311e03d0ba9 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Thu, 15 Dec 2022 10:55:56 -0500 Subject: [PATCH 01/11] remove setup.cfg setup.py --- .flake8 | 8 +++ dev-requirements.txt | 15 ----- pyproject.toml | 139 ++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 5 -- setup.cfg | 18 ------ setup.py | 38 ------------ test-requirements.txt | 3 - 7 files changed, 147 insertions(+), 79 deletions(-) create mode 100644 .flake8 delete mode 100644 dev-requirements.txt create mode 100644 pyproject.toml delete mode 100755 requirements.txt delete mode 100755 setup.cfg delete mode 100755 setup.py delete mode 100755 test-requirements.txt diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..b81e04f --- /dev/null +++ b/.flake8 @@ -0,0 +1,8 @@ +[flake8] +exclude = docs/* +max-line-length = 88 +extend-ignore = E203 +ignore = E501,W504,W503 +per-file-ignores = + **/__init__.py:F401 + **/tests/*:D \ No newline at end of file diff --git a/dev-requirements.txt b/dev-requirements.txt deleted file mode 100644 index 6745a13..0000000 --- a/dev-requirements.txt +++ /dev/null @@ -1,15 +0,0 @@ --r test-requirements.txt -codecov==2.1.8 -flake8==3.7.8 -autopep8==1.4.4 -isort==4.3.21 -nbsphinx==0.8.7 -pydata-sphinx-theme==0.7.1 -Sphinx==4.2.0 -nbconvert==6.4.5 -ipython==7.16.3 -pygments==2.8.1 -jupyter==1.0.0 -pandoc==1.0.2 -ipykernel==5.3.4 -matplotlib==3.2.2 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..32e4e52 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,139 @@ +[project] +name = "autonormalize" +readme = "README.md" +description = "a library for automated table normalization" +dynamic = ["version"] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "Topic :: Software Development", + "Topic :: Scientific/Engineering", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX", + "Operating System :: Unix", + "Operating System :: MacOS", +] +authors = [ + {name="Alteryx, Inc.", email="open_source_support@alteryx.com"} +] +maintainers = [ + {name="Alteryx, Inc.", email="open_source_support@alteryx.com"} +] +keywords = ["feature engineering", "data science", "machine learning", "normalization"] +license = {text = "BSD 3-clause"} +requires-python = ">=3.8,<4" +dependencies = [ + "featuretools >= 1.0.0", + "numpy >= 1.13.3", + "pandas >= 0.23.0", + "tqdm >= 4.19.2", + "graphviz >= 0.8.4", +] + +[project.urls] +"Source Code"= "https://github.com/alteryx/autonormalize/" +"Issue Tracker" = "https://github.com/alteryx/autonormalize/issues" +"Twitter" = "https://twitter.com/alteryxoss" +"Chat" = "https://join.slack.com/t/alteryx-oss/shared_invite/zt-182tyvuxv-NzIn6eiCEf8TBziuKp0bNA" + +[project.optional-dependencies] +test = [ + "pytest >= 5.2.0", + "pytest-cov >= 2.6.1", + "pytest-xdist >= 1.26.1", +] +dev = [ + "flake8 == 5.0.4", + "isort == 5.10.1", + "black[jupyter] == 22.6.0", + "pre-commit == 2.20.0", + "nbsphinx == 0.8.10", + "pydata-sphinx-theme == 0.12.0", + "Sphinx == 5.3.0", + "nbconvert == 7.2.6", + "ipython == 8.7.0", + "pygments == 2.13.0", + "jupyter == 1.0.0", + "pandoc == 2.3", + "ipykernel == 6.19.2", + "matplotlib == 3.6.2", + "featuretools[test]", +] +updater = [ + 
"alteryx-open-src-update-checker >= 2.1.0" +] + +[tool.setuptools] +include-package-data = true +license-files = [ + "LICENSE", +] + +[tool.setuptools.packages.find] +namespaces = true + +[tool.setuptools.package-data] +"*" = [ + "*.txt", + "README.md", +] + +[tool.setuptools.exclude-package-data] +"*" = [ + "* __pycache__", + "*.py[co]", + "docs/*" +] + +[tool.setuptools.dynamic] +version = {attr = "autonormalize.version.__version__"} + + +[tool.pytest.ini_options] +addopts = "--doctest-modules" +testpaths = [ + "autonormalize/tests/*" +] +filterwarnings = [ + "ignore::DeprecationWarning", + "ignore::PendingDeprecationWarning" +] + +[tool.isort] +profile = "black" +forced_separate = "autonormalize" +known_first_party = "autonormalize" +skip = "__init__.py" +filter_files = true +multi_line_output = 3 + +[tool.black] +target-version = ['py310'] + +[tool.coverage.run] +source = ["autonormalize"] + +[tool.coverage.report] +exclude_lines =[ + "pragma: no cover", + "def __repr__", + "raise AssertionError", + "raise NotImplementedError", + "if __name__ == .__main__.:", + "if self._verbose:", + "if verbose:", + "if profile:", + "pytest.skip" +] +[build-system] +requires = [ + "setuptools >= 61.0.0", + "wheel" +] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100755 index 474fd7e..0000000 --- a/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -featuretools>=1.0.0 -numpy>=1.13.3 -pandas>=0.23.0 -tqdm>=4.19.2 -graphviz>=0.8.4 diff --git a/setup.cfg b/setup.cfg deleted file mode 100755 index 568fef5..0000000 --- a/setup.cfg +++ /dev/null @@ -1,18 +0,0 @@ -[metadata] -description-file = README.md -[tool:pytest] -addopts = --doctest-modules -python_files = autonormalize/tests/* -filterwarnings = - ignore::DeprecationWarning - ignore::PendingDeprecationWarning -[flake8] -exclude = docs/* -ignore = E501,W504 -per-file-ignores = - **/__init__.py:F401 -[aliases] -test=pytest -[isort] -multi_line_output=3 -known_third_party=numpy,pandas,pytest diff --git a/setup.py b/setup.py deleted file mode 100755 index 425447c..0000000 --- a/setup.py +++ /dev/null @@ -1,38 +0,0 @@ -from os import path - -from setuptools import find_packages, setup - -dirname = path.abspath(path.dirname(__file__)) -with open(path.join(dirname, 'README.md')) as f: - long_description = f.read() - -setup( - name='autonormalize', - version='2.0.1', - description='a library for automated table normalization', - url='https://github.com/alteryx/autonormalize', - license='BSD 3-clause', - author='Alteryx, Inc.', - author_email='support@featurelabs.com', - classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - ], - install_requires=open('requirements.txt').readlines(), - tests_require=open('test-requirements.txt').readlines(), - python_requires='>=3.7, <4', - test_suite='autonormalize/tests', - packages=find_packages(), - include_package_data=True, - entry_points={ - "featuretools_plugin": [ - 'autonormalize = autonormalize', - ], - }, - long_description=long_description, - long_description_content_type='text/markdown', -) diff --git a/test-requirements.txt b/test-requirements.txt deleted file mode 100755 index 5c9c48c..0000000 --- a/test-requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -pytest>=5.2.0 -pytest-cov>=2.6.1 -pytest-xdist>=1.26.1 From 
90f27f8b6687bc55787761cadcd56ec092591a6c Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Thu, 15 Dec 2022 10:56:20 -0500 Subject: [PATCH 02/11] add pre commit config --- .pre-commit-config.yaml | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..a86d4fb --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,37 @@ +exclude: ^LICENSE/|\.(html|csv|svg|md|txt|json)$ +default_stages: [commit] +repos: + - repo: https://github.com/kynan/nbstripout + rev: 0.5.0 + hooks: + - id: nbstripout + entry: nbstripout + language: python + types: [jupyter] + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: end-of-file-fixer + - id: trailing-whitespace + - repo: https://github.com/MarcoGorelli/absolufy-imports + rev: v0.3.1 + hooks: + - id: absolufy-imports + files: ^autonormalize/ + - repo: https://github.com/asottile/add-trailing-comma + rev: v2.2.3 + hooks: + - id: add-trailing-comma + name: Add trailing comma + - repo: https://github.com/PyCQA/isort + rev: 5.10.1 + hooks: + - id: isort + args: [--settings-path=./pyproject.toml] + - repo: https://github.com/python/black + rev: 22.6.0 + hooks: + - id: black + args: [--target-version=py310] + additional_dependencies: [".[jupyter]"] + types_or: [python, jupyter] \ No newline at end of file From ec70dd37670835e844c803918725d54c4cc06578 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Thu, 15 Dec 2022 10:59:18 -0500 Subject: [PATCH 03/11] lint --- .github/workflows/lint_check.yml | 10 +- .pre-commit-config.yaml | 2 +- Makefile | 44 ++- autonormalize/__init__.py | 2 +- autonormalize/autonormalize.py | 16 +- autonormalize/classes.py | 17 +- autonormalize/dfd.py | 22 +- autonormalize/examples/example_data_gen.py | 4 +- autonormalize/normalize.py | 28 +- autonormalize/tests/test_classes.py | 290 +++++++++++++----- autonormalize/tests/test_dfd.py | 178 +++++++++-- autonormalize/tests/test_example.py | 51 ++- autonormalize/tests/test_normalize.py | 258 +++++++++++----- docs/source/conf.py | 119 ++++--- docs/source/guides/demo/food/__init__.py | 5 +- docs/source/guides/demo/liquor/__init__.py | 5 +- docs/source/guides/editing_dependencies.ipynb | 20 +- docs/source/guides/kaggle_food_dataset.ipynb | 7 +- .../guides/kaggle_liquor_sales_dataset.ipynb | 8 +- 19 files changed, 771 insertions(+), 315 deletions(-) diff --git a/.github/workflows/lint_check.yml b/.github/workflows/lint_check.yml index 4e402e5..72a6183 100644 --- a/.github/workflows/lint_check.yml +++ b/.github/workflows/lint_check.yml @@ -12,14 +12,14 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python_version: ["3.9"] + python_version: ["3.11"] steps: - name: Set up python ${{ matrix.python_version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v3 with: python-version: ${{ matrix.python_version }} - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: ref: ${{ github.event.pull_request.head.ref }} repository: ${{ github.event.pull_request.head.repo.full_name }} @@ -27,8 +27,6 @@ jobs: run: | pip config --site set global.progress_bar off python -m pip install --upgrade pip - python -m pip install -e . 
- python -m pip install -r dev-requirements.txt - python -m pip install -r test-requirements.txt + python -m pip install -e .[test] - name: Run lint test run: make lint diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a86d4fb..e84f795 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,6 +32,6 @@ repos: rev: 22.6.0 hooks: - id: black - args: [--target-version=py310] + args: [--target-version=py311] additional_dependencies: [".[jupyter]"] types_or: [python, jupyter] \ No newline at end of file diff --git a/Makefile b/Makefile index b9126b7..b692a93 100755 --- a/Makefile +++ b/Makefile @@ -4,37 +4,51 @@ clean: find . -name '*.pyc' -delete find . -name __pycache__ -delete find . -name '*~' -delete + find . -name '.coverage.*' -delete .PHONY: entry-point-test entry-point-test: - cd ~ && python -c "from featuretools import autonormalize" + cd ~ && python -c "from autonormalize import autonormalize" .PHONY: lint lint: - flake8 autonormalize && isort --check-only autonormalize + isort --check-only autonormalize + black autonormalize docs/source -t py311 --check + flake8 autonormalize .PHONY: lint-fix lint-fix: - autopep8 --in-place --recursive --max-line-length=100 --exclude="*/migrations/*" --select="E225,E303,E302,E203,E128,E231,E251,E271,E127,E126,E301,W291,W293,E226,E306,E221,E261,E111,E114" autonormalize + black autonormalize docs/source -t py311 isort autonormalize .PHONY: test test: lint - pytest autonormalize/ + pytest autonormalize/ -n auto .PHONY: testcoverage testcoverage: lint - pytest autonormalize/ --cov=autonormalize + pytest autonormalize/ -n auto --cov=autonormalize .PHONY: installdeps -installdeps: +installdeps: upgradepip pip install --upgrade pip - pip install -e . - pip install -r dev-requirements.txt - -.PHONY: package_autonormalize -package_autonormalize: - python setup.py sdist - $(eval DT_VERSION=$(shell python setup.py --version)) - tar -zxvf "dist/autonormalize-${DT_VERSION}.tar.gz" - mv "autonormalize-${DT_VERSION}" unpacked_sdist \ No newline at end of file + pip install -e ".[dev]" + +.PHONY: upgradepip +upgradepip: + python -m pip install --upgrade pip + +.PHONY: upgradebuild +upgradebuild: + python -m pip install --upgrade build + +.PHONY: upgradesetuptools +upgradesetuptools: + python -m pip install --upgrade setuptools + +.PHONY: package +package: upgradepip upgradebuild upgradesetuptools + python -m build + $(eval PACKAGE=$(shell python -c "from pep517.meta import load; metadata = load('.'); print(metadata.version)")) + tar -zxvf "dist/autonormalize-${PACKAGE}.tar.gz" + mv "autonormalize-${PACKAGE}" unpacked_sdist \ No newline at end of file diff --git a/autonormalize/__init__.py b/autonormalize/__init__.py index 3f860a0..64c3900 100755 --- a/autonormalize/__init__.py +++ b/autonormalize/__init__.py @@ -2,4 +2,4 @@ from .autonormalize import * from .classes import Dependencies -__version__ = '2.0.1' +__version__ = "2.0.1" diff --git a/autonormalize/autonormalize.py b/autonormalize/autonormalize.py index 278b315..70d3a92 100644 --- a/autonormalize/autonormalize.py +++ b/autonormalize/autonormalize.py @@ -92,7 +92,7 @@ def make_entityset(df, dependencies, name=None, time_index=None): while stack != []: current = stack.pop() - if (current.df.ww.schema is None): + if current.df.ww.schema is None: current.df.ww.init(index=current.index[0], name=current.index[0]) current_df_name = current.df.ww.name @@ -101,13 +101,15 @@ def make_entityset(df, dependencies, name=None, time_index=None): else: dataframes[current_df_name] = 
(current.df, current.index[0]) for child in current.children: - if (child.df.ww.schema is None): + if child.df.ww.schema is None: child.df.ww.init(index=child.index[0], name=child.index[0]) child_df_name = child.df.ww.name # add to stack # add relationship stack.append(child) - relationships.append((child_df_name, child.index[0], current_df_name, child.index[0])) + relationships.append( + (child_df_name, child.index[0], current_df_name, child.index[0]) + ) return ft.EntitySet(name, dataframes, relationships) @@ -163,10 +165,12 @@ def normalize_entityset(es, accuracy=0.98): # to normalize while preserving existing relationships if len(es.dataframes) > 1: - raise ValueError('There is more than one dataframe in this EntitySet') + raise ValueError("There is more than one dataframe in this EntitySet") if len(es.dataframes) == 0: - raise ValueError('This EntitySet is empty') + raise ValueError("This EntitySet is empty") df = es.dataframes[0] - new_es = auto_entityset(df, accuracy, index=df.ww.index, name=es.id, time_index=df.ww.time_index) + new_es = auto_entityset( + df, accuracy, index=df.ww.index, name=es.id, time_index=df.ww.time_index + ) return new_es diff --git a/autonormalize/classes.py b/autonormalize/classes.py index 2c65942..ba1b6e9 100644 --- a/autonormalize/classes.py +++ b/autonormalize/classes.py @@ -305,8 +305,16 @@ def __hash__(self): return id(self) def __str__(self): - return str({"attributes": str(self.attrs), "visited": self.visited, - "category": self.category, "prev": self.prev, "next": self.next, "loc": id(self)}) + return str( + { + "attributes": str(self.attrs), + "visited": self.visited, + "category": self.category, + "prev": self.prev, + "next": self.next, + "loc": id(self), + } + ) class Dependencies(object): @@ -480,7 +488,7 @@ def remove_implied_extroneous(self): i = 0 while i < len(self._data[rhs]): - if self._data[rhs][i] in self._data[rhs][i + 1:]: + if self._data[rhs][i] in self._data[rhs][i + 1 :]: self._data[rhs].pop(i) else: i += 1 @@ -667,6 +675,7 @@ def find_closure(rel, attrs): closure (set[str]) : attrs' closure, aka the attributes that can be determined from the attributes in attrs """ + def helper(set_attr, rel): if rel == []: return set(set_attr) @@ -676,6 +685,7 @@ def helper(set_attr, rel): rel_.remove((dep_attrs, dep)) return helper(set_attr + [dep], rel_) return set_attr + return set(helper(attrs[:], rel)) @@ -724,6 +734,7 @@ def get_mask(self, col, val): return self._masks[col][val] return None + # class BitIndexSet(object): # """ # A BitIndexSet represents a set where each of the elements are an integer. 
diff --git a/autonormalize/dfd.py b/autonormalize/dfd.py index d6a39ba..a28e473 100644 --- a/autonormalize/dfd.py +++ b/autonormalize/dfd.py @@ -96,7 +96,9 @@ def find_LHSs(rhs, attrs, df, partitions, accuracy, masks): else: node.infer_type() if node.category == 0: - if compute_partitions(df, rhs, node.attrs, partitions, accuracy, masks): + if compute_partitions( + df, rhs, node.attrs, partitions, accuracy, masks + ): if node.is_minimal(): min_deps.add_dep(node.attrs) node.category = 2 @@ -112,7 +114,9 @@ def find_LHSs(rhs, attrs, df, partitions, accuracy, masks): node = pick_next_node(node, trace, min_deps, max_non_deps, df.columns) - seeds = nodes_from_seeds(sorted(generate_next_seeds(max_non_deps, min_deps, lhs_attrs))) + seeds = nodes_from_seeds( + sorted(generate_next_seeds(max_non_deps, min_deps, lhs_attrs)) + ) return min_deps @@ -353,9 +357,15 @@ def approximate_dependencies(lhs_set, rhs, df, accuracy, masks): if df_lhs_rhs.shape[0] - df_lhs.shape[0] > limit: return False - merged = df_lhs.merge(df_lhs_rhs, indicator=True, how='outer') # create new df that is the merge of df_one and df_two - indicator = merged[merged['_merge'] == 'right_only'] # filter out the rows that were only on the right side (the rows that are preventing the two dataframes from being equal) - indicator = indicator.drop_duplicates(lhs_set) # find unique combinations of columns in LHS_set that characterize the disrepencies (have 2+ different values in rhs column) + merged = df_lhs.merge( + df_lhs_rhs, indicator=True, how="outer" + ) # create new df that is the merge of df_one and df_two + indicator = merged[ + merged["_merge"] == "right_only" + ] # filter out the rows that were only on the right side (the rows that are preventing the two dataframes from being equal) + indicator = indicator.drop_duplicates( + lhs_set + ) # find unique combinations of columns in LHS_set that characterize the disrepencies (have 2+ different values in rhs column) acc = 0 for index, row in indicator.iterrows(): @@ -365,7 +375,7 @@ def approximate_dependencies(lhs_set, rhs, df, accuracy, masks): m = masks.get_mask(attr, row[attr]) if m is None: - if df[attr].dtypes.name == 'datetime64[ns]': + if df[attr].dtypes.name == "datetime64[ns]": m = df[attr] == row[attr] else: m = df[attr].values == row[attr] diff --git a/autonormalize/examples/example_data_gen.py b/autonormalize/examples/example_data_gen.py index fd46d01..59a1c0b 100644 --- a/autonormalize/examples/example_data_gen.py +++ b/autonormalize/examples/example_data_gen.py @@ -3,7 +3,7 @@ def generate_example_3(): - csvData = [['A', 'B', 'C', 'D', 'E', 'F', 'G']] + csvData = [["A", "B", "C", "D", "E", "F", "G"]] for x in range(100000): b = random.randint(0, 25) @@ -14,7 +14,7 @@ def generate_example_3(): elem[x] = str(elem[x]) csvData.append(elem) - with open('example_3', 'w') as csvFile: + with open("example_3", "w") as csvFile: writer = csv.writer(csvFile) writer.writerows(csvData) diff --git a/autonormalize/normalize.py b/autonormalize/normalize.py index 40f1c26..bb730f8 100644 --- a/autonormalize/normalize.py +++ b/autonormalize/normalize.py @@ -80,8 +80,8 @@ def make_indexes(depdf): if len(prim_key) > 1: - depdf.df.insert(0, '_'.join(prim_key), range(0, len(depdf.df))) - depdf.index = ['_'.join(prim_key)] + depdf.df.insert(0, "_".join(prim_key), range(0, len(depdf.df))) + depdf.index = ["_".join(prim_key)] # now need to replace it in the parent df... 
if depdf.parent is not None: @@ -99,13 +99,15 @@ def make_indexes(depdf): else: mask = mask & m - new_val = depdf.df[mask]['_'.join(prim_key)].item() + new_val = depdf.df[mask]["_".join(prim_key)].item() for index in indices[name]: add[index] = new_val depdf.parent.df.drop(columns=prim_key, inplace=True) - depdf.parent.df.insert(len(depdf.parent.df.columns), '_'.join(prim_key), add) + depdf.parent.df.insert( + len(depdf.parent.df.columns), "_".join(prim_key), add + ) for child in depdf.children: make_indexes(child) @@ -146,7 +148,9 @@ def split_up(split_on, depdf): parent_deps, child_deps = split_on_dep(split_on, depdf.deps) child = DepDF(child_deps, form_child(depdf.df, child_deps), split_on, depdf) depdf.deps = parent_deps - depdf.df = depdf.df.drop(columns=list(set(depdf.df.columns).difference(parent_deps.all_attrs()))) + depdf.df = depdf.df.drop( + columns=list(set(depdf.df.columns).difference(parent_deps.all_attrs())) + ) depdf.children.append(child) normalize_dataframe(depdf) normalize_dataframe(child) @@ -304,7 +308,10 @@ def split_on_dep(lhs_dep, dependencies): if len(old_rhs.intersection(lhs)) != 0: new_deps[rhs].remove(lhs) - return (Dependencies(old_deps, dependencies.get_prim_key()), Dependencies(new_deps, lhs_dep)) + return ( + Dependencies(old_deps, dependencies.get_prim_key()), + Dependencies(new_deps, lhs_dep), + ) def drop_primary_dups(df, prim_key): @@ -355,7 +362,12 @@ def choose_index(keys, df): options = [key for key in sort_key if len(key) == m] for key in options: for attr in key: - if "_id" in attr.lower() or " id" in attr.lower() or "id _" in attr.lower() or "id " in attr.lower(): + if ( + "_id" in attr.lower() + or " id" in attr.lower() + or "id _" in attr.lower() + or "id " in attr.lower() + ): return list(key) if df is None: @@ -381,6 +393,6 @@ def filter(keys, df): """ for key, rhs in keys[:]: for attr in key: - if df[attr].dtypes.name not in set(['category', 'int64', 'object']): + if df[attr].dtypes.name not in set(["category", "int64", "object"]): keys.remove((key, rhs)) break diff --git a/autonormalize/tests/test_classes.py b/autonormalize/tests/test_classes.py index f9c923f..d783d1a 100644 --- a/autonormalize/tests/test_classes.py +++ b/autonormalize/tests/test_classes.py @@ -1,9 +1,4 @@ -from autonormalize.classes import ( - Dependencies, - DfdDependencies, - LHSs, - find_closure -) +from autonormalize.classes import Dependencies, DfdDependencies, LHSs, find_closure def assert_equal_dependency_dics(dep1, dep2): @@ -31,114 +26,224 @@ def assert_equal_tuple_rels(rels1, rels2): def test_all_sets_and_add_dep(): - lhss = LHSs(['a', 'b', 'c', 'd', 'e', 'f', 'g']) + lhss = LHSs(["a", "b", "c", "d", "e", "f", "g"]) assert lhss.all_sets() == set() - set_1 = frozenset(['a', 'c', 'd']) + set_1 = frozenset(["a", "c", "d"]) lhss.add_dep(set_1) assert lhss.all_sets() == set([set_1]) - set_2 = frozenset(['a', 'c', 'e', 'f', 'g']) - set_3 = frozenset(['b']) + set_2 = frozenset(["a", "c", "e", "f", "g"]) + set_3 = frozenset(["b"]) lhss.add_dep(set_2) lhss.add_dep(set_3) assert lhss.all_sets() == set([set_1, set_2, set_3]) def test_contains_subset(): - lhss = LHSs(['a', 'b', 'c', 'd', 'e', 'f', 'g']) - set_1 = frozenset(['a', 'c', 'd']) + lhss = LHSs(["a", "b", "c", "d", "e", "f", "g"]) + set_1 = frozenset(["a", "c", "d"]) lhss.add_dep(set_1) - set_2 = frozenset(['a', 'c', 'e', 'f', 'g']) - set_3 = frozenset(['g']) + set_2 = frozenset(["a", "c", "e", "f", "g"]) + set_3 = frozenset(["g"]) lhss.add_dep(set_2) lhss.add_dep(set_3) assert lhss.contains_subset(set_1) - assert 
lhss.contains_subset(frozenset(['a', 'c', 'd', 'f'])) - assert not lhss.contains_subset(frozenset(['b'])) + assert lhss.contains_subset(frozenset(["a", "c", "d", "f"])) + assert not lhss.contains_subset(frozenset(["b"])) def test_contains_superset(): - lhss = LHSs(['a', 'b', 'c', 'd', 'e', 'f', 'g']) - set_1 = frozenset(['a', 'c', 'd', 'e', 'g']) + lhss = LHSs(["a", "b", "c", "d", "e", "f", "g"]) + set_1 = frozenset(["a", "c", "d", "e", "g"]) lhss.add_dep(set_1) - set_2 = frozenset(['a', 'c', 'e', 'f']) - set_3 = frozenset(['b', 'c']) + set_2 = frozenset(["a", "c", "e", "f"]) + set_3 = frozenset(["b", "c"]) lhss.add_dep(set_2) lhss.add_dep(set_3) assert lhss.contains_superset(set_1) - assert lhss.contains_superset(frozenset(['a', 'c', 'f'])) - assert not lhss.contains_superset(frozenset(['a', 'b', 'c'])) + assert lhss.contains_superset(frozenset(["a", "c", "f"])) + assert not lhss.contains_superset(frozenset(["a", "b", "c"])) def test_LHSs_add_dep_and_all_sets(): - lhss = LHSs(['a', 'b', 'c', 'd', 'e', 'f', 'g']) + lhss = LHSs(["a", "b", "c", "d", "e", "f", "g"]) assert lhss.all_sets() == set() - set_1 = frozenset(['a', 'c', 'd', 'e', 'g']) + set_1 = frozenset(["a", "c", "d", "e", "g"]) lhss.add_dep(set_1) assert lhss.all_sets() == set([set_1]) - set_2 = frozenset(['a', 'c', 'e', 'f']) + set_2 = frozenset(["a", "c", "e", "f"]) lhss.add_dep(set_2) assert lhss.all_sets() == set([set_1, set_2]) - set_3 = frozenset(['b', 'c']) + set_3 = frozenset(["b", "c"]) lhss.add_dep(set_3) assert lhss.all_sets() == set([set_1, set_2, set_3]) def test_add_unique_lhs(): - dependencies = DfdDependencies(["name", "age", "height", "weight", "location", "speed", "rating", "experience", "mother"]) + dependencies = DfdDependencies( + [ + "name", + "age", + "height", + "weight", + "location", + "speed", + "rating", + "experience", + "mother", + ] + ) dependencies.add_unique_lhs("name") - assert_equal_dependency_dics(dependencies.serialize(), {"rating": [["name"]], "age": [["name"]], - "height": [["name"]], "weight": [["name"]], - "location": [["name"]], "speed": [["name"]], - "experience": [["name"]], "mother": [["name"]], "name": []}) + assert_equal_dependency_dics( + dependencies.serialize(), + { + "rating": [["name"]], + "age": [["name"]], + "height": [["name"]], + "weight": [["name"]], + "location": [["name"]], + "speed": [["name"]], + "experience": [["name"]], + "mother": [["name"]], + "name": [], + }, + ) def test_add_LHSs(): - lhss_weight = LHSs(set(["name", "age", "height", "weight", "location", "speed", "rating", "experience", "mother"])) + lhss_weight = LHSs( + set( + [ + "name", + "age", + "height", + "weight", + "location", + "speed", + "rating", + "experience", + "mother", + ] + ) + ) lhss_weight.add_dep(frozenset(["name"])) lhss_weight.add_dep(frozenset(["age", "height"])) - lhss_age = LHSs(set(["name", "age", "height", "weight", "location", "speed", "rating", "experience", "mother"])) + lhss_age = LHSs( + set( + [ + "name", + "age", + "height", + "weight", + "location", + "speed", + "rating", + "experience", + "mother", + ] + ) + ) lhss_age.add_dep(frozenset(["name"])) - dependencies = DfdDependencies(["name", "age", "height", "weight", "location", "speed", "rating", "experience", "mother"]) + dependencies = DfdDependencies( + [ + "name", + "age", + "height", + "weight", + "location", + "speed", + "rating", + "experience", + "mother", + ] + ) dependencies.add_LHSs("age", lhss_age) - assert_equal_dependency_dics(dependencies.serialize(), {"rating": [], "age": [["name"]], "height": [], "weight": [], - 
"location": [], "speed": [], - "experience": [], "mother": [], "name": []}) + assert_equal_dependency_dics( + dependencies.serialize(), + { + "rating": [], + "age": [["name"]], + "height": [], + "weight": [], + "location": [], + "speed": [], + "experience": [], + "mother": [], + "name": [], + }, + ) dependencies.add_LHSs("weight", lhss_weight) - assert_equal_dependency_dics(dependencies.serialize(), {"rating": [], "age": [["name"]], "height": [], - "weight": [["name"], ["age", "height"]], "location": [], "speed": [], - "experience": [], "mother": [], "name": []}) + assert_equal_dependency_dics( + dependencies.serialize(), + { + "rating": [], + "age": [["name"]], + "height": [], + "weight": [["name"], ["age", "height"]], + "location": [], + "speed": [], + "experience": [], + "mother": [], + "name": [], + }, + ) def test_add_and_remove_dep(): - dep_dic = {'A': [], 'B': [['A']], 'C': [['D', 'G'], ['A']], - 'D': [['A'], ['C', 'G']], 'E': [['D', 'G'], ['A'], ['C']], - 'F': [['A'], ['B']], 'G': [['A'], ['C', 'D']]} + dep_dic = { + "A": [], + "B": [["A"]], + "C": [["D", "G"], ["A"]], + "D": [["A"], ["C", "G"]], + "E": [["D", "G"], ["A"], ["C"]], + "F": [["A"], ["B"]], + "G": [["A"], ["C", "D"]], + } dependencies = Dependencies(dep_dic) - dependencies.add_dep('B', ['C']) - assert_equal_dependency_dics(dependencies.serialize(), - {'A': [], 'B': [['A'], ['C']], 'C': [['D', 'G'], ['A']], - 'D': [['A'], ['C', 'G']], 'E': [['D', 'G'], ['A'], ['C']], - 'F': [['A'], ['B']], 'G': [['A'], ['C', 'D']]}) - dependencies.remove_dep('B', ['C']) + dependencies.add_dep("B", ["C"]) + assert_equal_dependency_dics( + dependencies.serialize(), + { + "A": [], + "B": [["A"], ["C"]], + "C": [["D", "G"], ["A"]], + "D": [["A"], ["C", "G"]], + "E": [["D", "G"], ["A"], ["C"]], + "F": [["A"], ["B"]], + "G": [["A"], ["C", "D"]], + }, + ) + dependencies.remove_dep("B", ["C"]) assert_equal_dependency_dics(dependencies.serialize(), dep_dic) def test_tuple_relations(): - dep_dic = {'A': [], 'B': [['A']], 'C': [['D', 'G'], ['A']], - 'D': [['A'], ['C', 'G']], 'E': [['D', 'G'], ['A'], ['C']]} + dep_dic = { + "A": [], + "B": [["A"]], + "C": [["D", "G"], ["A"]], + "D": [["A"], ["C", "G"]], + "E": [["D", "G"], ["A"], ["C"]], + } dependencies = Dependencies(dep_dic) - tuple_comp = [(['D', 'G'], 'C'), (['A'], 'C'), (['A'], 'B'), - (['D', 'G'], 'E'), (['A'], 'E'), (['C'], 'E'), - (['A'], 'D'), (['C', 'G'], 'D')] + tuple_comp = [ + (["D", "G"], "C"), + (["A"], "C"), + (["A"], "B"), + (["D", "G"], "E"), + (["A"], "E"), + (["C"], "E"), + (["A"], "D"), + (["C", "G"], "D"), + ] assert_equal_tuple_rels(dependencies.tuple_relations(), tuple_comp) # assert set(dependencies.tuple_relations()) == set(tuple_comp) def test_remove_implied_extroneous(): - dep_dic = {'A': [], 'B': [['A']], 'C': [['A', 'B']]} + dep_dic = {"A": [], "B": [["A"]], "C": [["A", "B"]]} dependencies = Dependencies(dep_dic) dependencies.remove_implied_extroneous() - assert dependencies.serialize() == {'A': [], 'B': [['A']], 'C': [['A']]} + assert dependencies.serialize() == {"A": [], "B": [["A"]], "C": [["A"]]} # def test_remove_redundant(): @@ -150,37 +255,63 @@ def test_remove_implied_extroneous(): def test_find_candidate_keys(): dep_dic = { - 'A': [['B']], 'B': [['E'], ['A', 'D']], 'C': [['E', 'F']], - 'D': [['A']], 'E': [['A']], 'F': [['G']], 'G': []} + "A": [["B"]], + "B": [["E"], ["A", "D"]], + "C": [["E", "F"]], + "D": [["A"]], + "E": [["A"]], + "F": [["G"]], + "G": [], + } dependencies = Dependencies(dep_dic) dependencies.remove_implied_extroneous() - 
assert_equal_cand_keys(dependencies.find_candidate_keys(), [{'A', 'G'}, {'B', 'G'}, {'E', 'G'}]) + assert_equal_cand_keys( + dependencies.find_candidate_keys(), [{"A", "G"}, {"B", "G"}, {"E", "G"}] + ) def test_find_partial_deps(): dep_dic = { - 'A': [['B']], 'B': [['E'], ['A', 'D']], 'C': [['E', 'F']], - 'D': [['A']], 'E': [['A']], 'F': [['G']], 'G': []} - dependencies = Dependencies(dep_dic, ['A', 'G']) + "A": [["B"]], + "B": [["E"], ["A", "D"]], + "C": [["E", "F"]], + "D": [["A"]], + "E": [["A"]], + "F": [["G"]], + "G": [], + } + dependencies = Dependencies(dep_dic, ["A", "G"]) dependencies.remove_implied_extroneous() - partial_deps = [(['A'], 'D'), (['G'], 'F'), (['A'], 'B'), (['A'], 'E')] + partial_deps = [(["A"], "D"), (["G"], "F"), (["A"], "B"), (["A"], "E")] assert_equal_tuple_rels(dependencies.find_partial_deps(), partial_deps) def test_find_closure(): dep_dic = { - 'A': [['B']], 'B': [['E'], ['A', 'D']], 'C': [['E', 'F']], - 'D': [['A']], 'E': [['A']], 'F': [['G']], 'G': []} + "A": [["B"]], + "B": [["E"], ["A", "D"]], + "C": [["E", "F"]], + "D": [["A"]], + "E": [["A"]], + "F": [["G"]], + "G": [], + } dependencies = Dependencies(dep_dic) rels = dependencies.tuple_relations() - clos = {'A', 'B', 'D', 'E'} - assert find_closure(rels, ['A']) == clos + clos = {"A", "B", "D", "E"} + assert find_closure(rels, ["A"]) == clos def test_from_rels(): dep_dic = { - 'A': [['B']], 'B': [['E'], ['A', 'D']], 'C': [['E', 'F']], - 'D': [['A']], 'E': [['A']], 'F': [['G']], 'G': []} + "A": [["B"]], + "B": [["E"], ["A", "D"]], + "C": [["E", "F"]], + "D": [["A"]], + "E": [["A"]], + "F": [["G"]], + "G": [], + } dependencies = Dependencies(dep_dic) rels = dependencies.tuple_relations() dependencies_new = Dependencies.from_rels(rels) @@ -189,15 +320,20 @@ def test_from_rels(): def test_find_trans_deps(): dep_dic = { - 'A': [], 'B': [], 'C': [], 'D': [['F']], - 'E': [['A', 'B', 'C', 'D']], 'F': [['A', 'B']]} - dep = Dependencies(dep_dic, ['A', 'B', 'C']) + "A": [], + "B": [], + "C": [], + "D": [["F"]], + "E": [["A", "B", "C", "D"]], + "F": [["A", "B"]], + } + dep = Dependencies(dep_dic, ["A", "B", "C"]) dep.remove_implied_extroneous() - assert dep.find_trans_deps() == [(['F'], 'D')] + assert dep.find_trans_deps() == [(["F"], "D")] def test_equi_atttrs(): - dep_dic = {'A': [['B']], 'B': [['A']], 'C': [], 'D': [['A']]} - dep = Dependencies(dep_dic, ['A', 'C']) - assert dep.equiv_attrs('A', 'B') - assert not dep.equiv_attrs('A', 'D') + dep_dic = {"A": [["B"]], "B": [["A"]], "C": [], "D": [["A"]]} + dep = Dependencies(dep_dic, ["A", "C"]) + assert dep.equiv_attrs("A", "B") + assert not dep.equiv_attrs("A", "D") diff --git a/autonormalize/tests/test_dfd.py b/autonormalize/tests/test_dfd.py index 1f1e832..c94c31f 100644 --- a/autonormalize/tests/test_dfd.py +++ b/autonormalize/tests/test_dfd.py @@ -11,13 +11,13 @@ "id": [100, 101, 102, 103, 104, 105, 106, 107, 109], "age": [1, 2, 3, 4, 5, 6, 7, 5, 6], "height": [4, 5, 6, 7, 8, 9, 10, 8, 9], - "less_than_5": [1, 1, 1, 1, 0, 0, 0, 0, 0] + "less_than_5": [1, 1, 1, 1, 0, 0, 0, 0, 0], } df_1 = pd.DataFrame(dic_1) # A = index, B = random, C = random, D = random, # E = c != 1, F = b < 10, G = c + d -df_2 = pd.read_csv(os.path.join(path, 'autonormalize/examples/example_3')) +df_2 = pd.read_csv(os.path.join(path, "autonormalize/examples/example_3")) def assert_equal_dependency_dics(dep1, dep2): @@ -40,37 +40,171 @@ def serialization_equal(dic_1, dic_2): def test_dfd(): - dep = {"id": [], "age": [["height"], ["id"]], "height": [["age"], ["id"]], - "less_than_5": [["age"], 
["height"], ["id"]]} + dep = { + "id": [], + "age": [["height"], ["id"]], + "height": [["age"], ["id"]], + "less_than_5": [["age"], ["height"], ["id"]], + } assert_equal_dependency_dics(dfd.dfd(df_1, 0.98).serialize(), dep) - dep = {"A": [], "B": [["A"]], "C": [["D", "G"], ["A"]], "D": [["C", "G"], ["A"]], - "E": [["C"], ["D", "G"], ["A"]], "F": [["B"], ["A"]], "G": [["C", "D"], ["A"]]} + dep = { + "A": [], + "B": [["A"]], + "C": [["D", "G"], ["A"]], + "D": [["C", "G"], ["A"]], + "E": [["C"], ["D", "G"], ["A"]], + "F": [["B"], ["A"]], + "G": [["C", "D"], ["A"]], + } assert_equal_dependency_dics(dfd.dfd(df_2, 0.98).serialize(), dep) def test_compute_partitions(): - mask = Masks(['a', 'b', 'c']) - a = [6, 2, 3, 7, 8, 1, 0, 2, 0, 3, 6, 0, 4, 6, 8, 7, 6, 8, 1, 5, 1, 3, 3, 0, 0, 4, 5, 5, 7, 0, 8, 2, 4, 7, 0, 0, 6, 4, 6, 8] + mask = Masks(["a", "b", "c"]) + a = [ + 6, + 2, + 3, + 7, + 8, + 1, + 0, + 2, + 0, + 3, + 6, + 0, + 4, + 6, + 8, + 7, + 6, + 8, + 1, + 5, + 1, + 3, + 3, + 0, + 0, + 4, + 5, + 5, + 7, + 0, + 8, + 2, + 4, + 7, + 0, + 0, + 6, + 4, + 6, + 8, + ] # b = [int(x%2 == 0) for x in a] - b = [1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1] + b = [ + 1, + 1, + 0, + 0, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 1, + 1, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + ] # c = [(a[i] + b[i])<4 for i in range(40)] - c = [False, True, True, False, False, True, True, True, True, True, False, True, False, False, False, False, False, False, True, False, True, True, True, True, True, False, False, False, False, True, False, True, False, False, True, True, False, False, False, False] - df = pd.DataFrame({'a': a, 'b': b, 'c': c}) - assert dfd.compute_partitions(df, 'c', frozenset(['a', 'b']), {}, 1.00, mask) - assert dfd.compute_partitions(df, 'c', frozenset(['a', 'b']), {}, 0.90, mask) - - assert not dfd.compute_partitions(df, 'a', frozenset(['c']), {}, 1.00, mask) - assert not dfd.compute_partitions(df, 'a', frozenset(['c']), {}, 0.90, mask) + c = [ + False, + True, + True, + False, + False, + True, + True, + True, + True, + True, + False, + True, + False, + False, + False, + False, + False, + False, + True, + False, + True, + True, + True, + True, + True, + False, + False, + False, + False, + True, + False, + True, + False, + False, + True, + True, + False, + False, + False, + False, + ] + df = pd.DataFrame({"a": a, "b": b, "c": c}) + assert dfd.compute_partitions(df, "c", frozenset(["a", "b"]), {}, 1.00, mask) + assert dfd.compute_partitions(df, "c", frozenset(["a", "b"]), {}, 0.90, mask) + + assert not dfd.compute_partitions(df, "a", frozenset(["c"]), {}, 1.00, mask) + assert not dfd.compute_partitions(df, "a", frozenset(["c"]), {}, 0.90, mask) c[0] = True - df = pd.DataFrame({'a': a, 'b': b, 'c': c}) - assert dfd.compute_partitions(df, 'c', frozenset(['a', 'b']), {}, 0.97, mask) - assert not dfd.compute_partitions(df, 'c', frozenset(['a', 'b']), {}, 0.98, mask) + df = pd.DataFrame({"a": a, "b": b, "c": c}) + assert dfd.compute_partitions(df, "c", frozenset(["a", "b"]), {}, 0.97, mask) + assert not dfd.compute_partitions(df, "c", frozenset(["a", "b"]), {}, 0.98, mask) c[35] = False - df = pd.DataFrame({'a': a, 'b': b, 'c': c}) - assert dfd.compute_partitions(df, 'c', frozenset(['a', 'b']), {}, 0.95, mask) - assert not dfd.compute_partitions(df, 'c', frozenset(['a', 'b']), {}, 0.96, mask) + df = pd.DataFrame({"a": a, "b": b, "c": c}) + assert 
dfd.compute_partitions(df, "c", frozenset(["a", "b"]), {}, 0.95, mask) + assert not dfd.compute_partitions(df, "c", frozenset(["a", "b"]), {}, 0.96, mask) # def test_approximate_dependencies(): diff --git a/autonormalize/tests/test_example.py b/autonormalize/tests/test_example.py index ac42a63..164e091 100644 --- a/autonormalize/tests/test_example.py +++ b/autonormalize/tests/test_example.py @@ -1,29 +1,46 @@ -import featuretools as ft -import pandas as pd from unittest.mock import patch +import featuretools as ft +import pandas as pd import pytest + import autonormalize as an def test_ft_mock_customer(): - df = ft.demo.load_mock_customer(n_customers=80, n_products=50, n_sessions=200, - n_transactions=10000, return_single_table=True) + df = ft.demo.load_mock_customer( + n_customers=80, + n_products=50, + n_sessions=200, + n_transactions=10000, + return_single_table=True, + ) - entityset = an.auto_entityset(df, name="Customer Transactions", time_index='transaction_time') + entityset = an.auto_entityset( + df, name="Customer Transactions", time_index="transaction_time" + ) - assert set(entityset['transaction_id'].columns) == set(['transaction_id', 'session_id', 'transaction_time', - 'product_id', 'amount']) + assert set(entityset["transaction_id"].columns) == set( + ["transaction_id", "session_id", "transaction_time", "product_id", "amount"] + ) - assert set(entityset['product_id'].columns) == set(['product_id', 'brand']) + assert set(entityset["product_id"].columns) == set(["product_id", "brand"]) - assert set(entityset['session_id'].columns) == set(['session_id', 'customer_id', 'device', 'session_start']) + assert set(entityset["session_id"].columns) == set( + ["session_id", "customer_id", "device", "session_start"] + ) - assert set(entityset['customer_id'].columns) == set(['customer_id', 'zip_code', 'join_date', 'birthday']) + assert set(entityset["customer_id"].columns) == set( + ["customer_id", "zip_code", "join_date", "birthday"] + ) - assert set([str(rel) for rel in entityset.relationships]) == set([' session_id.session_id>', - ' product_id.product_id>', - ' customer_id.customer_id>']) + assert set([str(rel) for rel in entityset.relationships]) == set( + [ + " session_id.session_id>", + " product_id.product_id>", + " customer_id.customer_id>", + ] + ) @patch("autonormalize.autonormalize.auto_entityset") @@ -44,7 +61,13 @@ def test_normalize_entityset(auto_entityset): an.normalize_entityset(es, accuracy) - auto_entityset.assert_called_with(df_out, accuracy, index=df_out.ww.index, name=es.id, time_index=df_out.ww.time_index) + auto_entityset.assert_called_with( + df_out, + accuracy, + index=df_out.ww.index, + name=es.id, + time_index=df_out.ww.time_index, + ) es.add_dataframe(df2, "df2") diff --git a/autonormalize/tests/test_normalize.py b/autonormalize/tests/test_normalize.py index 37a717b..54bcc4b 100644 --- a/autonormalize/tests/test_normalize.py +++ b/autonormalize/tests/test_normalize.py @@ -9,10 +9,15 @@ def test_normalize(): # check that there are no new relations? # there can be less however? 
dep_dic = { - 'A': [], 'B': [], 'C': [], 'D': [['F']], - 'E': [['A', 'B', 'C', 'D']], 'F': [['A', 'B']]} - dep = classes.Dependencies(dep_dic, ['A', 'B', 'C']) - df = pd.DataFrame(columns=['A', 'B', 'C', 'D', 'E', 'F'], dtype='int64') + "A": [], + "B": [], + "C": [], + "D": [["F"]], + "E": [["A", "B", "C", "D"]], + "F": [["A", "B"]], + } + dep = classes.Dependencies(dep_dic, ["A", "B", "C"]) + df = pd.DataFrame(columns=["A", "B", "C", "D", "E", "F"], dtype="int64") new = normalize.normalize(dep, df) dep_dic = dep.serialize() for x in new: @@ -29,88 +34,143 @@ def test_normalize(): def test_find_most_comm(): - deps = classes.Dependencies({}, ['d']) - rels = [(['a'], 'b'), (['b'], 'c'), (['b'], 'a'), - (['d'], 'a')] - assert normalize.find_most_comm(rels, deps) == ['b'] - rels = [(['a', 'c'], 'b'), (['b'], 'c'), (['b'], 'a'), - (['d'], 'a'), (['a', 'c'], 'b')] - assert normalize.find_most_comm(rels, deps) == ['b'] + deps = classes.Dependencies({}, ["d"]) + rels = [(["a"], "b"), (["b"], "c"), (["b"], "a"), (["d"], "a")] + assert normalize.find_most_comm(rels, deps) == ["b"] + rels = [ + (["a", "c"], "b"), + (["b"], "c"), + (["b"], "a"), + (["d"], "a"), + (["a", "c"], "b"), + ] + assert normalize.find_most_comm(rels, deps) == ["b"] def test_split_on_dep(): - dep_dic = {'A': [], 'B': [], 'C': [['A'], ['B']], 'D': [['B']]} - new = normalize.split_on_dep(['B'], classes.Dependencies(dep_dic)) - assert new[0].serialize() == {'A': [], 'B': []} - assert new[1].serialize() == {'B': [], 'C': [['B']], 'D': [['B']]} + dep_dic = {"A": [], "B": [], "C": [["A"], ["B"]], "D": [["B"]]} + new = normalize.split_on_dep(["B"], classes.Dependencies(dep_dic)) + assert new[0].serialize() == {"A": [], "B": []} + assert new[1].serialize() == {"B": [], "C": [["B"]], "D": [["B"]]} def test_drop_primary_dups(): - df_dic = {"city": ['honolulu', 'boston', 'honolulu', 'dallas', 'seattle', 'honolulu', 'boston', 'honolulu', 'seattle', 'boston'], - "state": ['HI', 'MA', 'HI', 'TX', 'WA', 'AL', 'MA', 'HI', 'WA', 'NA'], - "is_liberal": [True, True, True, False, True, True, True, True, True, False]} + df_dic = { + "city": [ + "honolulu", + "boston", + "honolulu", + "dallas", + "seattle", + "honolulu", + "boston", + "honolulu", + "seattle", + "boston", + ], + "state": ["HI", "MA", "HI", "TX", "WA", "AL", "MA", "HI", "WA", "NA"], + "is_liberal": [True, True, True, False, True, True, True, True, True, False], + } df = pd.DataFrame(df_dic) - new_df = normalize.drop_primary_dups(df, ['city']) + new_df = normalize.drop_primary_dups(df, ["city"]) - df_new_dic = {"city": ["boston", "dallas", "honolulu", "seattle"], - "state": ["MA", "TX", "HI", "WA"], - "is_liberal": [True, False, True, True]} + df_new_dic = { + "city": ["boston", "dallas", "honolulu", "seattle"], + "state": ["MA", "TX", "HI", "WA"], + "is_liberal": [True, False, True, True], + } assert_frame_equal(pd.DataFrame(df_new_dic), new_df) - df = pd.DataFrame([[True, True, True], [True, True, True], [False, True, False], - [True, False, False], [True, False, False], [False, True, False], [True, False, True]], - columns=["requires_light", "is_dark", "light_on"]) - - new_df = normalize.drop_primary_dups(df, ['requires_light', 'is_dark']) + df = pd.DataFrame( + [ + [True, True, True], + [True, True, True], + [False, True, False], + [True, False, False], + [True, False, False], + [False, True, False], + [True, False, True], + ], + columns=["requires_light", "is_dark", "light_on"], + ) + + new_df = normalize.drop_primary_dups(df, ["requires_light", "is_dark"]) # compare_df = 
pd.DataFrame([[True, False, False], [False, True, False], [True, True, True]], # columns=["requires_light", "is_dark", "light_on"]) # compare_df = compare_df.sort_values(by=["requires_light", "is_dark"]).reset_index(drop=True) for index, row in new_df.iterrows(): - if row['requires_light'] and not row['is_dark']: - assert not row['light_on'] - if not row['requires_light'] and row['is_dark']: - assert not row['light_on'] - if row['requires_light'] and row['is_dark']: - assert row['light_on'] + if row["requires_light"] and not row["is_dark"]: + assert not row["light_on"] + if not row["requires_light"] and row["is_dark"]: + assert not row["light_on"] + if row["requires_light"] and row["is_dark"]: + assert row["light_on"] def test_filter(): - keys = [(['A'], 'E'), (['A', 'B'], 'E'), (['C', 'D'], 'E')] - df = pd.DataFrame(columns=['A', 'B', 'C', 'D']) - df = df.astype({'A': 'float64', 'B': 'int64', 'C': 'category', 'D': 'object'}) + keys = [(["A"], "E"), (["A", "B"], "E"), (["C", "D"], "E")] + df = pd.DataFrame(columns=["A", "B", "C", "D"]) + df = df.astype({"A": "float64", "B": "int64", "C": "category", "D": "object"}) normalize.filter(keys, df) - assert keys == [(['C', 'D'], 'E')] + assert keys == [(["C", "D"], "E")] def test_choose_index(): - keys = [['A'], ['A_id'], ['B']] - df = pd.DataFrame(columns=['A', 'B', 'C', 'D']) - assert normalize.choose_index(keys, df) == ['A_id'] + keys = [["A"], ["A_id"], ["B"]] + df = pd.DataFrame(columns=["A", "B", "C", "D"]) + assert normalize.choose_index(keys, df) == ["A_id"] - keys = [['B'], ['C'], ['A']] - assert normalize.choose_index(keys, df) == ['A'] + keys = [["B"], ["C"], ["A"]] + assert normalize.choose_index(keys, df) == ["A"] - keys = [['A', 'C'], ['A', 'B']] - assert normalize.choose_index(keys, df) == ['A', 'B'] + keys = [["A", "C"], ["A", "B"]] + assert normalize.choose_index(keys, df) == ["A", "B"] def test_normalize_dataframe(): - dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', - 'Yellow', 'Green', 'Green', 'Blue'], - 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], - 'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'], - 'city': ['boston', 'boston', 'boston', 'chicago', 'chicago', - 'honolulu', 'honolulu', 'boston', 'boston', 'austin'], - 'state': ['MA', 'MA', 'MA', 'IL', 'IL', 'HI', 'HI', 'MA', 'MA', 'TX']} + dic = { + "team": [ + "Red", + "Red", + "Red", + "Orange", + "Orange", + "Yellow", + "Yellow", + "Green", + "Green", + "Blue", + ], + "jersey_num": [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], + "player_name": ["A", "B", "C", "D", "A", "E", "B", "A", "G", "H"], + "city": [ + "boston", + "boston", + "boston", + "chicago", + "chicago", + "honolulu", + "honolulu", + "boston", + "boston", + "austin", + ], + "state": ["MA", "MA", "MA", "IL", "IL", "HI", "HI", "MA", "MA", "TX"], + } df = pd.DataFrame(dic) - deps = classes.Dependencies({'team': [['player_name', 'jersey_num']], - 'jersey_num': [['player_name', 'team']], - 'player_name': [['team', 'jersey_num']], - 'city': [['team'], ['state'], ['player_name', 'jersey_num']], - 'state': [['team'], ['player_name', 'jersey_num'], ['city']]}, ['team', 'jersey_num']) + deps = classes.Dependencies( + { + "team": [["player_name", "jersey_num"]], + "jersey_num": [["player_name", "team"]], + "player_name": [["team", "jersey_num"]], + "city": [["team"], ["state"], ["player_name", "jersey_num"]], + "state": [["team"], ["player_name", "jersey_num"], ["city"]], + }, + ["team", "jersey_num"], + ) depdf = normalize.DepDF(deps, df, deps.get_prim_key()) normalize.normalize_dataframe(depdf) 
@@ -118,63 +178,91 @@ def test_normalize_dataframe(): assert len(new_dfs) == 3 - dic_one = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', - 'Yellow', 'Green', 'Green', 'Blue'], - 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], - 'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']} - - dic_two = {'team': ['Red', 'Orange', 'Yellow', 'Green', 'Blue', 'Blue'], - 'city': ['boston', 'chicago', 'honolulu', 'boston', 'austin', 'austin']} - - dic_three = {'city': ['boston', 'chicago', 'honolulu', 'austin', 'austin'], - 'state': ['MA', 'IL', 'HI', 'TX', 'TX']} - - assert new_dfs[0].equals(normalize.drop_primary_dups(pd.DataFrame(dic_one), ['team', 'jersey_num'])) - assert new_dfs[1].equals(normalize.drop_primary_dups(pd.DataFrame(dic_two), ['team'])) - assert new_dfs[2].equals(normalize.drop_primary_dups(pd.DataFrame(dic_three), ['city'])) + dic_one = { + "team": [ + "Red", + "Red", + "Red", + "Orange", + "Orange", + "Yellow", + "Yellow", + "Green", + "Green", + "Blue", + ], + "jersey_num": [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], + "player_name": ["A", "B", "C", "D", "A", "E", "B", "A", "G", "H"], + } + + dic_two = { + "team": ["Red", "Orange", "Yellow", "Green", "Blue", "Blue"], + "city": ["boston", "chicago", "honolulu", "boston", "austin", "austin"], + } + + dic_three = { + "city": ["boston", "chicago", "honolulu", "austin", "austin"], + "state": ["MA", "IL", "HI", "TX", "TX"], + } + + assert new_dfs[0].equals( + normalize.drop_primary_dups(pd.DataFrame(dic_one), ["team", "jersey_num"]) + ) + assert new_dfs[1].equals( + normalize.drop_primary_dups(pd.DataFrame(dic_two), ["team"]) + ) + assert new_dfs[2].equals( + normalize.drop_primary_dups(pd.DataFrame(dic_three), ["city"]) + ) def test_make_indexes(): - dic = {"id": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], - "month": ['dec', 'dec', 'jul', 'jul', 'dec', 'jul', 'jul', - 'jul', 'dec', 'jul'], - "hemisphere": ['N', 'N', 'N', 'N', 'S', 'S', 'S', 'S', 'S', 'N'], - "is_winter": [True, True, False, False, False, True, True, True, False, False]} + dic = { + "id": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + "month": ["dec", "dec", "jul", "jul", "dec", "jul", "jul", "jul", "dec", "jul"], + "hemisphere": ["N", "N", "N", "N", "S", "S", "S", "S", "S", "N"], + "is_winter": [True, True, False, False, False, True, True, True, False, False], + } df = pd.DataFrame(dic) - deps = classes.Dependencies({'id': [], - 'month': [['id'], ['hemisphere', 'is_winter']], - 'hemisphere': [['month', 'is_winter'], ['id']], - 'is_winter': [['month', 'hemisphere'], ['id']]}, ['id']) + deps = classes.Dependencies( + { + "id": [], + "month": [["id"], ["hemisphere", "is_winter"]], + "hemisphere": [["month", "is_winter"], ["id"]], + "is_winter": [["month", "hemisphere"], ["id"]], + }, + ["id"], + ) depdf = normalize.DepDF(deps, df, deps.get_prim_key()) normalize.normalize_dataframe(depdf) normalize.make_indexes(depdf) new_dfs = depdf.return_dfs() - mask = (new_dfs[1]['month'] == 'dec') & (new_dfs[1]['hemisphere'] == 'N') + mask = (new_dfs[1]["month"] == "dec") & (new_dfs[1]["hemisphere"] == "N") val = new_dfs[1][mask][new_dfs[1].columns[0]].iloc[0] assert new_dfs[0][new_dfs[1].columns[0]][0] == val assert new_dfs[0][new_dfs[1].columns[0]][1] == val - mask = (new_dfs[1]['month'] == 'jul') & (new_dfs[1]['hemisphere'] == 'N') + mask = (new_dfs[1]["month"] == "jul") & (new_dfs[1]["hemisphere"] == "N") val = new_dfs[1][mask][new_dfs[1].columns[0]].iloc[0] assert new_dfs[0][new_dfs[1].columns[0]][2] == val assert new_dfs[0][new_dfs[1].columns[0]][3] == val assert 
new_dfs[0][new_dfs[1].columns[0]][9] == val - mask = (new_dfs[1]['month'] == 'dec') & (new_dfs[1]['hemisphere'] == 'S') + mask = (new_dfs[1]["month"] == "dec") & (new_dfs[1]["hemisphere"] == "S") val = new_dfs[1][mask][new_dfs[1].columns[0]].iloc[0] assert new_dfs[0][new_dfs[1].columns[0]][4] == val assert new_dfs[0][new_dfs[1].columns[0]][8] == val - mask = (new_dfs[1]['month'] == 'jul') & (new_dfs[1]['hemisphere'] == 'S') + mask = (new_dfs[1]["month"] == "jul") & (new_dfs[1]["hemisphere"] == "S") val = new_dfs[1][mask][new_dfs[1].columns[0]].iloc[0] assert new_dfs[0][new_dfs[1].columns[0]][5] == val assert new_dfs[0][new_dfs[1].columns[0]][6] == val assert new_dfs[0][new_dfs[1].columns[0]][7] == val # Make sure new column names are sorted - assert 'hemisphere_month' in new_dfs[0].columns - assert 'hemisphere_month' in new_dfs[1].columns + assert "hemisphere_month" in new_dfs[0].columns + assert "hemisphere_month" in new_dfs[1].columns diff --git a/docs/source/conf.py b/docs/source/conf.py index eb0b09b..74c93a9 100755 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -18,20 +18,20 @@ import subprocess import shutil from pathlib import Path -from sphinx.ext.autodoc import (Documenter, MethodDocumenter) +from sphinx.ext.autodoc import Documenter, MethodDocumenter from sphinx.ext.autodoc import MethodDocumenter, Documenter -path = os.path.join('..', '..') +path = os.path.join("..", "..") sys.path.insert(0, os.path.abspath(path)) # -- Project information ----------------------------------------------------- -project = 'AutoNormalize' -copyright = '2021, Alteryx, Inc.' -author = 'Alteryx, Inc.' +project = "AutoNormalize" +copyright = "2021, Alteryx, Inc." +author = "Alteryx, Inc." # The short X.Y version version = autonormalize.__version__ @@ -49,29 +49,29 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'nbsphinx', - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.napoleon', - 'sphinx.ext.viewcode', - 'sphinx.ext.extlinks', - 'sphinx.ext.graphviz', - 'sphinx.ext.inheritance_diagram', - 'IPython.sphinxext.ipython_console_highlighting', - 'IPython.sphinxext.ipython_directive', + "nbsphinx", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", + "sphinx.ext.extlinks", + "sphinx.ext.graphviz", + "sphinx.ext.inheritance_diagram", + "IPython.sphinxext.ipython_console_highlighting", + "IPython.sphinxext.ipython_directive", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The main toctree document. -master_doc = 'index' +master_doc = "index" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -83,7 +83,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['**.ipynb_checkpoints'] +exclude_patterns = ["**.ipynb_checkpoints"] # The name of the Pygments (syntax highlighting) style to use. pygments_style = None @@ -117,7 +117,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Custom sidebar templates, must be a dictionary that maps document names # to template names. @@ -133,7 +133,7 @@ # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'AutoNormalize' +htmlhelp_basename = "AutoNormalize" # -- Options for LaTeX output ------------------------------------------------ @@ -142,15 +142,12 @@ # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -160,8 +157,13 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'AutoNormalize.tex', 'AutoNormalize Documentation', - 'Alteryx, Inc.', 'manual'), + ( + master_doc, + "AutoNormalize.tex", + "AutoNormalize Documentation", + "Alteryx, Inc.", + "manual", + ), ] @@ -169,10 +171,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'autonormalize', 'AutoNormalize Documentation', - [author], 1) -] +man_pages = [(master_doc, "autonormalize", "AutoNormalize Documentation", [author], 1)] # -- Options for Texinfo output ---------------------------------------------- @@ -181,9 +180,15 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'AutoNormalize', 'AutoNormalize Documentation', - author, 'AutoNormalize', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "AutoNormalize", + "AutoNormalize Documentation", + author, + "AutoNormalize", + "One line description of project.", + "Miscellaneous", + ), ] @@ -202,26 +207,35 @@ # epub_uid = '' # A list of files that should not be packed into the epub file. 
-epub_exclude_files = ['search.html'] +epub_exclude_files = ["search.html"] # -- Extension configuration ------------------------------------------------- extlinks = { - 'issue': ('https://github.com/alteryx/autonormalize/issues/%s', '#'), - 'pr': ('https://github.com/alteryx/autonormalize/pull/%s', '#'), - 'user': ('https://github.com/%s', '@') + "issue": ("https://github.com/alteryx/autonormalize/issues/%s", "#"), + "pr": ("https://github.com/alteryx/autonormalize/pull/%s", "#"), + "user": ("https://github.com/%s", "@"), } autosummary_generate = ["api_reference.rst"] templates_path = ["_templates"] html_show_sphinx = False -nbsphinx_execute = 'always' -nbsphinx_timeout = 600 # sphinx defaults each cell to 30 seconds so we need to override here +nbsphinx_execute = "always" +nbsphinx_timeout = ( + 600 # sphinx defaults each cell to 30 seconds so we need to override here +) + +inheritance_graph_attrs = dict( + rankdir="LR", + size='"1000, 333"', + fontsize=30, + labelfontsize=30, + ratio="compress", + dpi=960, +) -inheritance_graph_attrs = dict(rankdir="LR", size='"1000, 333"', - fontsize=30, labelfontsize=30, ratio='compress', dpi=960) class AccessorLevelDocumenter(Documenter): """ @@ -230,10 +244,11 @@ class AccessorLevelDocumenter(Documenter): Referenced pandas-sphinx-theme (https://github.com/pandas-dev/pandas-sphinx-theme) and sphinx-doc (https://github.com/sphinx-doc/sphinx/blob/8c7faed6fcbc6b7d40f497698cb80fc10aee1ab3/sphinx/ext/autodoc/__init__.py#L846) """ + def resolve_name(self, modname, parents, path, base): - modname = 'autonormalize' - mod_cls = path.rstrip('.') - mod_cls = mod_cls.split('.') + modname = "autonormalize" + mod_cls = path.rstrip(".") + mod_cls = mod_cls.split(".") return modname, mod_cls + [base] @@ -254,20 +269,24 @@ def format_name(self): class AccessorMethodDocumenter(AccessorLevelDocumenter, MethodDocumenter): - objtype = 'accessormethod' - directivetype = 'method' + objtype = "accessormethod" + directivetype = "method" # lower than MethodDocumenter so this is not chosen for normal methods priority = 0.6 def setup(app): - home_dir = os.environ.get('HOME', '/') + home_dir = os.environ.get("HOME", "/") ipython_p = Path(home_dir + "/.ipython/profile_default/startup") ipython_p.mkdir(parents=True, exist_ok=True) file_p = os.path.abspath(os.path.dirname(__file__)) - shutil.copy(file_p + "/set-headers.py", home_dir + "/.ipython/profile_default/startup") - app.add_js_file('https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js') + shutil.copy( + file_p + "/set-headers.py", home_dir + "/.ipython/profile_default/startup" + ) + app.add_js_file( + "https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js" + ) app.add_css_file("style.css") app.add_autodocumenter(AccessorCallableDocumenter) - app.add_autodocumenter(AccessorMethodDocumenter) \ No newline at end of file + app.add_autodocumenter(AccessorMethodDocumenter) diff --git a/docs/source/guides/demo/food/__init__.py b/docs/source/guides/demo/food/__init__.py index 4628b38..7147dea 100644 --- a/docs/source/guides/demo/food/__init__.py +++ b/docs/source/guides/demo/food/__init__.py @@ -2,9 +2,10 @@ from pandas import read_csv from os.path import join -PWD = join(PWD, 'food') +PWD = join(PWD, "food") + def load_sample(): food_df = read_csv(join(PWD, "FAO.csv"), encoding="latin1") food_df = food_df.drop(columns=food_df.columns[10:]) - return food_df \ No newline at end of file + return food_df diff --git a/docs/source/guides/demo/liquor/__init__.py 
b/docs/source/guides/demo/liquor/__init__.py index 6f6aefc..840ab5f 100644 --- a/docs/source/guides/demo/liquor/__init__.py +++ b/docs/source/guides/demo/liquor/__init__.py @@ -2,7 +2,8 @@ from pandas import read_csv from os.path import join -PWD = join(PWD, 'liquor') +PWD = join(PWD, "liquor") + def load_sample(): df = read_csv(join(PWD, "Iowa_Liquor_Sales.csv")) @@ -12,4 +13,4 @@ def load_sample(): # df = df.dropna() # df = df.drop_duplicates() # df = df.head(1000) - return df \ No newline at end of file + return df diff --git a/docs/source/guides/editing_dependencies.ipynb b/docs/source/guides/editing_dependencies.ipynb index 6c74462..75b10c6 100644 --- a/docs/source/guides/editing_dependencies.ipynb +++ b/docs/source/guides/editing_dependencies.ipynb @@ -121,13 +121,15 @@ } ], "source": [ - "rows = [['tigers', 'boston', 'MA', 20],\n", - " ['elephants', 'chicago', 'IL', 21],\n", - " ['foxes', 'miami', 'FL', 20],\n", - " ['snakes', 'austin', 'TX', 20],\n", - " ['dolphins', 'honolulu', 'HI', 19],\n", - " ['eagles', 'houston', 'TX', 21]]\n", - "df = pd.DataFrame(rows, columns=['team', 'city', 'state', 'roster_size'])\n", + "rows = [\n", + " [\"tigers\", \"boston\", \"MA\", 20],\n", + " [\"elephants\", \"chicago\", \"IL\", 21],\n", + " [\"foxes\", \"miami\", \"FL\", 20],\n", + " [\"snakes\", \"austin\", \"TX\", 20],\n", + " [\"dolphins\", \"honolulu\", \"HI\", 19],\n", + " [\"eagles\", \"houston\", \"TX\", 21],\n", + "]\n", + "df = pd.DataFrame(rows, columns=[\"team\", \"city\", \"state\", \"roster_size\"])\n", "df" ] }, @@ -197,8 +199,8 @@ } ], "source": [ - "deps.remove_dep('team', ['city'])\n", - "deps.remove_dep('roster_size', ['city'])\n", + "deps.remove_dep(\"team\", [\"city\"])\n", + "deps.remove_dep(\"roster_size\", [\"city\"])\n", "print(deps)" ] }, diff --git a/docs/source/guides/kaggle_food_dataset.ipynb b/docs/source/guides/kaggle_food_dataset.ipynb index d250fca..2ea1363 100644 --- a/docs/source/guides/kaggle_food_dataset.ipynb +++ b/docs/source/guides/kaggle_food_dataset.ipynb @@ -184,7 +184,7 @@ } ], "source": [ - "entityset = an.auto_entityset(food_df, name='Foods')" + "entityset = an.auto_entityset(food_df, name=\"Foods\")" ] }, { @@ -265,7 +265,10 @@ ], "source": [ "import featuretools as ft\n", - "fm, features = ft.dfs(entityset=entityset, target_dataframe_name='Area Code_Element Code_Item Code')\n", + "\n", + "fm, features = ft.dfs(\n", + " entityset=entityset, target_dataframe_name=\"Area Code_Element Code_Item Code\"\n", + ")\n", "features" ] }, diff --git a/docs/source/guides/kaggle_liquor_sales_dataset.ipynb b/docs/source/guides/kaggle_liquor_sales_dataset.ipynb index ed168e5..327a4d6 100644 --- a/docs/source/guides/kaggle_liquor_sales_dataset.ipynb +++ b/docs/source/guides/kaggle_liquor_sales_dataset.ipynb @@ -158,7 +158,7 @@ ], "source": [ "df = load_sample()\n", - "print(\"Rows: \"+ str(df.shape[0]))\n", + "print(\"Rows: \" + str(df.shape[0]))\n", "print(\"Columns: \" + str(df.shape[1]))\n", "df.head(3)" ] @@ -194,7 +194,7 @@ } ], "source": [ - "df.dtypes\n" + "df.dtypes" ] }, { @@ -203,7 +203,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = df.astype({\"County Number\": 'int64', \"Category\": 'int64'})" + "df = df.astype({\"County Number\": \"int64\", \"Category\": \"int64\"})" ] }, { @@ -238,7 +238,7 @@ ], "source": [ "start = time.time()\n", - "entityset = an.auto_entityset(df, accuracy=0.96, name='liquor orders')\n", + "entityset = an.auto_entityset(df, accuracy=0.96, name=\"liquor orders\")\n", "time.time() - start" ] }, From 
8da35b4ab459abf1bed775f0ff257f4d51cc57c5 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Thu, 15 Dec 2022 11:02:59 -0500 Subject: [PATCH 04/11] update coverage --- .../workflows/unit_tests_with_latest_deps.yml | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/.github/workflows/unit_tests_with_latest_deps.yml b/.github/workflows/unit_tests_with_latest_deps.yml index b435cdd..d3184d4 100644 --- a/.github/workflows/unit_tests_with_latest_deps.yml +++ b/.github/workflows/unit_tests_with_latest_deps.yml @@ -1,51 +1,50 @@ +name: Unit Tests - Latest Dependencies on: pull_request: types: [opened, synchronize] push: branches: - main - -name: Unit Tests - Latest Dependencies +env: + PYARROW_IGNORE_TIMEZONE: 1 + ALTERYX_OPEN_SRC_UPDATE_CHECKER: False jobs: unit_tests: name: Unit Tests - Python ${{ matrix.python-version }} runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.7", "3.8", "3.9"] + python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - name: Set up python ${{ matrix.python_version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v3 with: python-version: ${{ matrix.python_version }} - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: ref: ${{ github.event.pull_request.head.ref }} repository: ${{ github.event.pull_request.head.repo.full_name }} - name: Build source distribution - run: make package_autonormalize + run: make package - name: Install package with test requirements run: | sudo python -m pip config --site set global.progress_bar off python -m pip install --upgrade pip - sudo apt update && sudo apt install -y graphviz - python -m pip install -e unpacked_sdist/ - python -m pip install -r unpacked_sdist/test-requirements.txt - - if: ${{ matrix.python_version == 3.7 }} + sudo apt update + sudo apt install -y graphviz + python -m pip install -e unpacked_sdist/[test] + - if: ${{ matrix.python_version == 3.8 }} name: Run unit tests with code coverage run: | - python -m pip install "$(cat dev-requirements.txt | grep codecov)" - coverage erase cd unpacked_sdist/ coverage erase - pytest autonormalize/ -n 2 --cov=autonormalize --cov-config=../.coveragerc - env: - PYARROW_IGNORE_TIMEZONE: 1 - ALTERYX_OPEN_SRC_UPDATE_CHECKER: False - - if: ${{ matrix.python_version == 3.7 }} + pytest autonormalize/ -n auto --cov=autonormalize --cov-config=../pyproject.toml --cov-report=xml:../coverage.xml + - if: ${{ matrix.python_version == 3.8 }} name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 + uses: codecov/codecov-action@v3 with: - fail_ci_if_error: true token: ${{ secrets.CODECOV_TOKEN }} + fail_ci_if_error: true + files: ${{ github.workspace }}/coverage.xml + verbose: true \ No newline at end of file From 43d605d32b73884e4076ecbcbc565c5106f01b9b Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Thu, 15 Dec 2022 11:04:44 -0500 Subject: [PATCH 05/11] update owrkflows --- .github/workflows/build_docs.yml | 10 +++++----- .github/workflows/entry_point_test.yml | 6 +++--- .github/workflows/lint_check.yml | 4 ++-- .github/workflows/release.yml | 2 +- .github/workflows/release_notes_updated.yml | 2 +- .github/workflows/unit_tests_with_latest_deps.yml | 4 ++-- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index 8228fe2..60cdcec 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -8,15 +8,15 @@ on: name: Build Docs jobs: build_docs: - name: 3.7 build docs + name: 3.8 
build docs runs-on: ubuntu-latest steps: - - name: Set up python 3.7 - uses: actions/setup-python@v2 + - name: Set up python 3.8 + uses: actions/setup-python@v4 with: - python-version: 3.7 + python-version: 3.8 - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: ref: ${{ github.event.pull_request.head.ref }} repository: ${{ github.event.pull_request.head.repo.full_name }} diff --git a/.github/workflows/entry_point_test.yml b/.github/workflows/entry_point_test.yml index ce2289d..4a81264 100644 --- a/.github/workflows/entry_point_test.yml +++ b/.github/workflows/entry_point_test.yml @@ -9,14 +9,14 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python_version: ["3.7", "3.8", "3.9"] + python_version: ["3.8", "3.9", "3.10", "3.11"] steps: - name: Set up python ${{ matrix.python_version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python_version }} - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: ref: ${{ github.event.pull_request.head.ref }} repository: ${{ github.event.pull_request.head.repo.full_name }} diff --git a/.github/workflows/lint_check.yml b/.github/workflows/lint_check.yml index 72a6183..7f7d22a 100644 --- a/.github/workflows/lint_check.yml +++ b/.github/workflows/lint_check.yml @@ -15,11 +15,11 @@ jobs: python_version: ["3.11"] steps: - name: Set up python ${{ matrix.python_version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python_version }} - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@v3 with: ref: ${{ github.event.pull_request.head.ref }} repository: ${{ github.event.pull_request.head.repo.full_name }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index df973bd..fe93bc4 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -8,7 +8,7 @@ jobs: name: PyPI Release runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: PyPI Upload uses: FeatureLabs/gh-action-pypi-upload@v1 env: diff --git a/.github/workflows/release_notes_updated.yml b/.github/workflows/release_notes_updated.yml index b2aadeb..01bb579 100644 --- a/.github/workflows/release_notes_updated.yml +++ b/.github/workflows/release_notes_updated.yml @@ -26,7 +26,7 @@ jobs: print('::set-output name=is_dev::' + str(is_dev)) - if: ${{ steps.branch.outputs.is_dev == 'True' }} name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: ref: ${{ github.event.pull_request.head.ref }} repository: ${{ github.event.pull_request.head.repo.full_name }} diff --git a/.github/workflows/unit_tests_with_latest_deps.yml b/.github/workflows/unit_tests_with_latest_deps.yml index d3184d4..a61bece 100644 --- a/.github/workflows/unit_tests_with_latest_deps.yml +++ b/.github/workflows/unit_tests_with_latest_deps.yml @@ -17,11 +17,11 @@ jobs: python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - name: Set up python ${{ matrix.python_version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python_version }} - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@v3 with: ref: ${{ github.event.pull_request.head.ref }} repository: ${{ github.event.pull_request.head.repo.full_name }} From cfd1380940c79d5488055747c0d7a11d62b9bce9 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Thu, 15 Dec 2022 11:08:46 -0500 Subject: [PATCH 
06/11] update owrkflows --- .github/workflows/build_docs.yml | 3 +-- .github/workflows/entry_point_test.yml | 4 ++-- .github/workflows/lint_check.yml | 2 +- .github/workflows/unit_tests_with_latest_deps.yml | 2 +- contributing.md | 3 +-- 5 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index 60cdcec..a17ff83 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -28,8 +28,7 @@ jobs: sudo apt install -y graphviz pip config --site set global.progress_bar off python -m pip install --upgrade pip - python -m pip install -e . - python -m pip install -r dev-requirements.txt + python -m pip install .[dev] - name: Build docs run: | make -C docs/ html diff --git a/.github/workflows/entry_point_test.yml b/.github/workflows/entry_point_test.yml index 4a81264..afd8f00 100644 --- a/.github/workflows/entry_point_test.yml +++ b/.github/workflows/entry_point_test.yml @@ -21,11 +21,11 @@ jobs: ref: ${{ github.event.pull_request.head.ref }} repository: ${{ github.event.pull_request.head.repo.full_name }} - name: Build source distribution - run: make package_autonormalize + run: make package - name: Install package run: | pip config --site set global.progress_bar off python -m pip install --upgrade pip - python -m pip install -e unpacked_sdist/ + python -m pip install unpacked_sdist/ - name: Test entry point run: make entry-point-test diff --git a/.github/workflows/lint_check.yml b/.github/workflows/lint_check.yml index 7f7d22a..037fc04 100644 --- a/.github/workflows/lint_check.yml +++ b/.github/workflows/lint_check.yml @@ -27,6 +27,6 @@ jobs: run: | pip config --site set global.progress_bar off python -m pip install --upgrade pip - python -m pip install -e .[test] + python -m pip install .[dev] - name: Run lint test run: make lint diff --git a/.github/workflows/unit_tests_with_latest_deps.yml b/.github/workflows/unit_tests_with_latest_deps.yml index a61bece..22ff7ae 100644 --- a/.github/workflows/unit_tests_with_latest_deps.yml +++ b/.github/workflows/unit_tests_with_latest_deps.yml @@ -33,7 +33,7 @@ jobs: python -m pip install --upgrade pip sudo apt update sudo apt install -y graphviz - python -m pip install -e unpacked_sdist/[test] + python -m pip install "unpacked_sdist/[test]" - if: ${{ matrix.python_version == 3.8 }} name: Run unit tests with code coverage run: | diff --git a/contributing.md b/contributing.md index 46ec281..c6d5dee 100644 --- a/contributing.md +++ b/contributing.md @@ -25,8 +25,7 @@ There are many ways to contribute to AutoNormalize, with the most common ones be git clone https://github.com/alteryx/autonormalize.git python -m venv venv source venv/bin/activate - python -m pip install -e . - python -m pip install -r dev-requirements.txt + python -m pip install -e ".[dev]" ``` * If you plan to build the docs locally, you will have to install [git lfs](https://git-lfs.github.com/) because the demo data for the guides use git-lfs From fadf9112bf391ba2f33d07c144d2827eee410cf3 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Thu, 15 Dec 2022 11:10:14 -0500 Subject: [PATCH 07/11] update release notes --- docs/source/release_notes.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 6f004b9..5a9226d 100755 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -3,15 +3,17 @@ Release Notes ------------- -.. 
Future Release - ============== +Future Release +============== * Enhancements * Fixes * Changes + * Transition to pyproject.toml, removing setup.py and setup.cfg (:pr:`17`) * Documentation Changes * Testing Changes -.. Thanks to the following people for contributing to this release: + Thanks to the following people for contributing to this release: + :user:`gsheni` v2.0.1 Apr 25, 2022 =================== From 16aa0832197330ed76ba5ab22de8e6f859d622e7 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Thu, 15 Dec 2022 11:12:33 -0500 Subject: [PATCH 08/11] run precommit --- .flake8 | 2 +- .github/workflows/release_notes_updated.yml | 2 +- .../workflows/unit_tests_with_latest_deps.yml | 4 +- .pre-commit-config.yaml | 2 +- Makefile | 2 +- autonormalize/.gitignore | 2 +- autonormalize/__init__.py | 7 +- autonormalize/autonormalize.py | 12 +- autonormalize/classes.py | 2 +- autonormalize/dfd.py | 17 +- autonormalize/normalize.py | 8 +- autonormalize/tests/test_classes.py | 15 +- autonormalize/tests/test_example.py | 12 +- autonormalize/tests/test_normalize.py | 6 +- autonormalize/version.py | 1 + docs/source/_static/style.css | 2 +- docs/source/_templates/accessor_callable.rst | 2 +- docs/source/_templates/accessor_method.rst | 2 +- docs/source/_templates/data_check_class.rst | 2 +- docs/source/_templates/estimator_class.rst | 2 +- docs/source/_templates/pipeline_class.rst | 2 +- docs/source/_templates/transformer_class.rst | 2 +- docs/source/api_reference.rst | 2 +- docs/source/conf.py | 14 +- docs/source/guides/editing_dependencies.ipynb | 198 +-------------- docs/source/guides/kaggle_food_dataset.ipynb | 214 +--------------- .../guides/kaggle_liquor_sales_dataset.ipynb | 235 ++---------------- docs/source/install.rst | 2 +- presentation.key | Bin 2307885 -> 2306990 bytes pyproject.toml | 2 +- release/upload.sh | 2 +- 31 files changed, 109 insertions(+), 668 deletions(-) create mode 100644 autonormalize/version.py diff --git a/.flake8 b/.flake8 index b81e04f..a2be3c9 100644 --- a/.flake8 +++ b/.flake8 @@ -5,4 +5,4 @@ extend-ignore = E203 ignore = E501,W504,W503 per-file-ignores = **/__init__.py:F401 - **/tests/*:D \ No newline at end of file + **/tests/*:D diff --git a/.github/workflows/release_notes_updated.yml b/.github/workflows/release_notes_updated.yml index 01bb579..11dac4b 100644 --- a/.github/workflows/release_notes_updated.yml +++ b/.github/workflows/release_notes_updated.yml @@ -32,4 +32,4 @@ jobs: repository: ${{ github.event.pull_request.head.repo.full_name }} - if: ${{ steps.branch.outputs.is_dev == 'True' }} name: Check if release notes were updated - run: cat docs/source/release_notes.rst | grep ":pr:\`${{ github.event.number }}\`" \ No newline at end of file + run: cat docs/source/release_notes.rst | grep ":pr:\`${{ github.event.number }}\`" diff --git a/.github/workflows/unit_tests_with_latest_deps.yml b/.github/workflows/unit_tests_with_latest_deps.yml index 22ff7ae..b4dab5e 100644 --- a/.github/workflows/unit_tests_with_latest_deps.yml +++ b/.github/workflows/unit_tests_with_latest_deps.yml @@ -31,7 +31,7 @@ jobs: run: | sudo python -m pip config --site set global.progress_bar off python -m pip install --upgrade pip - sudo apt update + sudo apt update sudo apt install -y graphviz python -m pip install "unpacked_sdist/[test]" - if: ${{ matrix.python_version == 3.8 }} @@ -47,4 +47,4 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} fail_ci_if_error: true files: ${{ github.workspace }}/coverage.xml - verbose: true \ No newline at end of file + verbose: true diff --git 
a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e84f795..4c66ff1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -34,4 +34,4 @@ repos: - id: black args: [--target-version=py311] additional_dependencies: [".[jupyter]"] - types_or: [python, jupyter] \ No newline at end of file + types_or: [python, jupyter] diff --git a/Makefile b/Makefile index b692a93..3934d17 100755 --- a/Makefile +++ b/Makefile @@ -51,4 +51,4 @@ package: upgradepip upgradebuild upgradesetuptools python -m build $(eval PACKAGE=$(shell python -c "from pep517.meta import load; metadata = load('.'); print(metadata.version)")) tar -zxvf "dist/autonormalize-${PACKAGE}.tar.gz" - mv "autonormalize-${PACKAGE}" unpacked_sdist \ No newline at end of file + mv "autonormalize-${PACKAGE}" unpacked_sdist diff --git a/autonormalize/.gitignore b/autonormalize/.gitignore index 5e680ca..f45482b 100644 --- a/autonormalize/.gitignore +++ b/autonormalize/.gitignore @@ -2,4 +2,4 @@ *.csv *.pyc -__pycache__/ \ No newline at end of file +__pycache__/ diff --git a/autonormalize/__init__.py b/autonormalize/__init__.py index 64c3900..8a929aa 100755 --- a/autonormalize/__init__.py +++ b/autonormalize/__init__.py @@ -1,5 +1,4 @@ # flake8: noqa -from .autonormalize import * -from .classes import Dependencies - -__version__ = "2.0.1" +from autonormalize.version import __version__ +from autonormalize.autonormalize import * +from autonormalize.classes import Dependencies diff --git a/autonormalize/autonormalize.py b/autonormalize/autonormalize.py index 70d3a92..1477738 100644 --- a/autonormalize/autonormalize.py +++ b/autonormalize/autonormalize.py @@ -1,7 +1,7 @@ import featuretools as ft -from . import dfd, normalize -from .classes import Dependencies +from autonormalize import dfd, normalize +from autonormalize.classes import Dependencies def find_dependencies(df, accuracy=0.98, index=None): @@ -108,7 +108,7 @@ def make_entityset(df, dependencies, name=None, time_index=None): # add relationship stack.append(child) relationships.append( - (child_df_name, child.index[0], current_df_name, child.index[0]) + (child_df_name, child.index[0], current_df_name, child.index[0]), ) return ft.EntitySet(name, dataframes, relationships) @@ -171,6 +171,10 @@ def normalize_entityset(es, accuracy=0.98): df = es.dataframes[0] new_es = auto_entityset( - df, accuracy, index=df.ww.index, name=es.id, time_index=df.ww.time_index + df, + accuracy, + index=df.ww.index, + name=es.id, + time_index=df.ww.time_index, ) return new_es diff --git a/autonormalize/classes.py b/autonormalize/classes.py index ba1b6e9..8d6ec00 100644 --- a/autonormalize/classes.py +++ b/autonormalize/classes.py @@ -313,7 +313,7 @@ def __str__(self): "prev": self.prev, "next": self.next, "loc": id(self), - } + }, ) diff --git a/autonormalize/dfd.py b/autonormalize/dfd.py index a28e473..8bbea13 100644 --- a/autonormalize/dfd.py +++ b/autonormalize/dfd.py @@ -4,7 +4,7 @@ import numpy from tqdm import tqdm -from .classes import DfdDependencies, LHSs, Masks, Node +from autonormalize.classes import DfdDependencies, LHSs, Masks, Node # see https://hpi.de/fileadmin/user_upload/fachgebiete/naumann/publications/2014/DFD_CIKM2014_p949_CRC.pdf for DFD paper # run script.py to see a couple examples @@ -97,7 +97,12 @@ def find_LHSs(rhs, attrs, df, partitions, accuracy, masks): node.infer_type() if node.category == 0: if compute_partitions( - df, rhs, node.attrs, partitions, accuracy, masks + df, + rhs, + node.attrs, + partitions, + accuracy, + masks, ): if node.is_minimal(): 
min_deps.add_dep(node.attrs) @@ -115,7 +120,7 @@ def find_LHSs(rhs, attrs, df, partitions, accuracy, masks): node = pick_next_node(node, trace, min_deps, max_non_deps, df.columns) seeds = nodes_from_seeds( - sorted(generate_next_seeds(max_non_deps, min_deps, lhs_attrs)) + sorted(generate_next_seeds(max_non_deps, min_deps, lhs_attrs)), ) return min_deps @@ -358,13 +363,15 @@ def approximate_dependencies(lhs_set, rhs, df, accuracy, masks): return False merged = df_lhs.merge( - df_lhs_rhs, indicator=True, how="outer" + df_lhs_rhs, + indicator=True, + how="outer", ) # create new df that is the merge of df_one and df_two indicator = merged[ merged["_merge"] == "right_only" ] # filter out the rows that were only on the right side (the rows that are preventing the two dataframes from being equal) indicator = indicator.drop_duplicates( - lhs_set + lhs_set, ) # find unique combinations of columns in LHS_set that characterize the disrepencies (have 2+ different values in rhs column) acc = 0 diff --git a/autonormalize/normalize.py b/autonormalize/normalize.py index bb730f8..514f5d2 100644 --- a/autonormalize/normalize.py +++ b/autonormalize/normalize.py @@ -1,6 +1,6 @@ import pandas as pd -from .classes import Dependencies +from autonormalize.classes import Dependencies def normalize(dependencies, df): @@ -106,7 +106,9 @@ def make_indexes(depdf): depdf.parent.df.drop(columns=prim_key, inplace=True) depdf.parent.df.insert( - len(depdf.parent.df.columns), "_".join(prim_key), add + len(depdf.parent.df.columns), + "_".join(prim_key), + add, ) for child in depdf.children: @@ -149,7 +151,7 @@ def split_up(split_on, depdf): child = DepDF(child_deps, form_child(depdf.df, child_deps), split_on, depdf) depdf.deps = parent_deps depdf.df = depdf.df.drop( - columns=list(set(depdf.df.columns).difference(parent_deps.all_attrs())) + columns=list(set(depdf.df.columns).difference(parent_deps.all_attrs())), ) depdf.children.append(child) normalize_dataframe(depdf) diff --git a/autonormalize/tests/test_classes.py b/autonormalize/tests/test_classes.py index d783d1a..dbf5784 100644 --- a/autonormalize/tests/test_classes.py +++ b/autonormalize/tests/test_classes.py @@ -90,7 +90,7 @@ def test_add_unique_lhs(): "rating", "experience", "mother", - ] + ], ) dependencies.add_unique_lhs("name") assert_equal_dependency_dics( @@ -122,8 +122,8 @@ def test_add_LHSs(): "rating", "experience", "mother", - ] - ) + ], + ), ) lhss_weight.add_dep(frozenset(["name"])) lhss_weight.add_dep(frozenset(["age", "height"])) @@ -139,8 +139,8 @@ def test_add_LHSs(): "rating", "experience", "mother", - ] - ) + ], + ), ) lhss_age.add_dep(frozenset(["name"])) dependencies = DfdDependencies( @@ -154,7 +154,7 @@ def test_add_LHSs(): "rating", "experience", "mother", - ] + ], ) dependencies.add_LHSs("age", lhss_age) assert_equal_dependency_dics( @@ -266,7 +266,8 @@ def test_find_candidate_keys(): dependencies = Dependencies(dep_dic) dependencies.remove_implied_extroneous() assert_equal_cand_keys( - dependencies.find_candidate_keys(), [{"A", "G"}, {"B", "G"}, {"E", "G"}] + dependencies.find_candidate_keys(), + [{"A", "G"}, {"B", "G"}, {"E", "G"}], ) diff --git a/autonormalize/tests/test_example.py b/autonormalize/tests/test_example.py index 164e091..9bfbe2c 100644 --- a/autonormalize/tests/test_example.py +++ b/autonormalize/tests/test_example.py @@ -17,21 +17,23 @@ def test_ft_mock_customer(): ) entityset = an.auto_entityset( - df, name="Customer Transactions", time_index="transaction_time" + df, + name="Customer Transactions", + 
time_index="transaction_time", ) assert set(entityset["transaction_id"].columns) == set( - ["transaction_id", "session_id", "transaction_time", "product_id", "amount"] + ["transaction_id", "session_id", "transaction_time", "product_id", "amount"], ) assert set(entityset["product_id"].columns) == set(["product_id", "brand"]) assert set(entityset["session_id"].columns) == set( - ["session_id", "customer_id", "device", "session_start"] + ["session_id", "customer_id", "device", "session_start"], ) assert set(entityset["customer_id"].columns) == set( - ["customer_id", "zip_code", "join_date", "birthday"] + ["customer_id", "zip_code", "join_date", "birthday"], ) assert set([str(rel) for rel in entityset.relationships]) == set( @@ -39,7 +41,7 @@ def test_ft_mock_customer(): " session_id.session_id>", " product_id.product_id>", " customer_id.customer_id>", - ] + ], ) diff --git a/autonormalize/tests/test_normalize.py b/autonormalize/tests/test_normalize.py index 54bcc4b..bfeaffa 100644 --- a/autonormalize/tests/test_normalize.py +++ b/autonormalize/tests/test_normalize.py @@ -206,13 +206,13 @@ def test_normalize_dataframe(): } assert new_dfs[0].equals( - normalize.drop_primary_dups(pd.DataFrame(dic_one), ["team", "jersey_num"]) + normalize.drop_primary_dups(pd.DataFrame(dic_one), ["team", "jersey_num"]), ) assert new_dfs[1].equals( - normalize.drop_primary_dups(pd.DataFrame(dic_two), ["team"]) + normalize.drop_primary_dups(pd.DataFrame(dic_two), ["team"]), ) assert new_dfs[2].equals( - normalize.drop_primary_dups(pd.DataFrame(dic_three), ["city"]) + normalize.drop_primary_dups(pd.DataFrame(dic_three), ["city"]), ) diff --git a/autonormalize/version.py b/autonormalize/version.py new file mode 100644 index 0000000..5719d15 --- /dev/null +++ b/autonormalize/version.py @@ -0,0 +1 @@ +__version__ = "2.0.1" \ No newline at end of file diff --git a/docs/source/_static/style.css b/docs/source/_static/style.css index c15288c..817782a 100644 --- a/docs/source/_static/style.css +++ b/docs/source/_static/style.css @@ -54,4 +54,4 @@ border-top: 2px solid white; margin-left: 7px; margin-right: 15px; -} } \ No newline at end of file +} } diff --git a/docs/source/_templates/accessor_callable.rst b/docs/source/_templates/accessor_callable.rst index 26b24fb..7a33018 100644 --- a/docs/source/_templates/accessor_callable.rst +++ b/docs/source/_templates/accessor_callable.rst @@ -3,4 +3,4 @@ .. currentmodule:: {{ module.split('.')[0] }} -.. autoaccessorcallable:: {{ (module.split('.')[1:] + [objname]) | join('.') }}.__call__ \ No newline at end of file +.. autoaccessorcallable:: {{ (module.split('.')[1:] + [objname]) | join('.') }}.__call__ diff --git a/docs/source/_templates/accessor_method.rst b/docs/source/_templates/accessor_method.rst index d196e10..aefbba6 100644 --- a/docs/source/_templates/accessor_method.rst +++ b/docs/source/_templates/accessor_method.rst @@ -3,4 +3,4 @@ .. currentmodule:: {{ module.split('.')[0] }} -.. autoaccessormethod:: {{ (module.split('.')[1:] + [objname]) | join('.') }} \ No newline at end of file +.. autoaccessormethod:: {{ (module.split('.')[1:] + [objname]) | join('.') }} diff --git a/docs/source/_templates/data_check_class.rst b/docs/source/_templates/data_check_class.rst index d59021c..d1b7e9b 100644 --- a/docs/source/_templates/data_check_class.rst +++ b/docs/source/_templates/data_check_class.rst @@ -27,7 +27,7 @@ {%- endif %} {%- endfor %} {% endblock %} - + {% block methods %} {% if methods %} .. 
rubric:: Methods: diff --git a/docs/source/_templates/estimator_class.rst b/docs/source/_templates/estimator_class.rst index 727a23a..6ffa503 100644 --- a/docs/source/_templates/estimator_class.rst +++ b/docs/source/_templates/estimator_class.rst @@ -29,7 +29,7 @@ {%- endif %} {%- endfor %} {% endblock %} - + {% block methods %} {% if methods %} .. rubric:: Methods: diff --git a/docs/source/_templates/pipeline_class.rst b/docs/source/_templates/pipeline_class.rst index 049a4e0..d9db3da 100644 --- a/docs/source/_templates/pipeline_class.rst +++ b/docs/source/_templates/pipeline_class.rst @@ -36,7 +36,7 @@ {%- endif %} {%- endfor %} {% endblock %} - + {% block methods %} {% if methods %} .. rubric:: Methods: diff --git a/docs/source/_templates/transformer_class.rst b/docs/source/_templates/transformer_class.rst index 6b421ff..9c5e4d2 100644 --- a/docs/source/_templates/transformer_class.rst +++ b/docs/source/_templates/transformer_class.rst @@ -28,7 +28,7 @@ {%- endif %} {%- endfor %} {% endblock %} - + {% block methods %} {% if methods %} .. rubric:: Methods: diff --git a/docs/source/api_reference.rst b/docs/source/api_reference.rst index 88c3af9..cb0749d 100755 --- a/docs/source/api_reference.rst +++ b/docs/source/api_reference.rst @@ -39,4 +39,4 @@ Dependencies Dependencies.find_candidate_keys Dependencies.find_partial_deps Dependencies.find_trans_deps - Dependencies.equiv_attrs \ No newline at end of file + Dependencies.equiv_attrs diff --git a/docs/source/conf.py b/docs/source/conf.py index 74c93a9..398c97f 100755 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -12,16 +12,15 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -import autonormalize import os -import sys -import subprocess import shutil +import subprocess +import sys from pathlib import Path -from sphinx.ext.autodoc import Documenter, MethodDocumenter +from sphinx.ext.autodoc import Documenter, MethodDocumenter -from sphinx.ext.autodoc import MethodDocumenter, Documenter +import autonormalize path = os.path.join("..", "..") sys.path.insert(0, os.path.abspath(path)) @@ -282,10 +281,11 @@ def setup(app): ipython_p.mkdir(parents=True, exist_ok=True) file_p = os.path.abspath(os.path.dirname(__file__)) shutil.copy( - file_p + "/set-headers.py", home_dir + "/.ipython/profile_default/startup" + file_p + "/set-headers.py", + home_dir + "/.ipython/profile_default/startup", ) app.add_js_file( - "https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js" + "https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js", ) app.add_css_file("style.css") app.add_autodocumenter(AccessorCallableDocumenter) diff --git a/docs/source/guides/editing_dependencies.ipynb b/docs/source/guides/editing_dependencies.ipynb index 75b10c6..6b355f6 100644 --- a/docs/source/guides/editing_dependencies.ipynb +++ b/docs/source/guides/editing_dependencies.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -26,100 +26,9 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
teamcitystateroster_size
0tigersbostonMA20
1elephantschicagoIL21
2foxesmiamiFL20
3snakesaustinTX20
4dolphinshonoluluHI19
5eagleshoustonTX21
\n", - "
" - ], - "text/plain": [ - " team city state roster_size\n", - "0 tigers boston MA 20\n", - "1 elephants chicago IL 21\n", - "2 foxes miami FL 20\n", - "3 snakes austin TX 20\n", - "4 dolphins honolulu HI 19\n", - "5 eagles houston TX 21" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "rows = [\n", " [\"tigers\", \"boston\", \"MA\", 20],\n", @@ -142,34 +51,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 2/2 [00:00<00:00, 318.27it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " {city} --> team\n", - " {team} --> city\n", - " {team} {city} --> state\n", - " {team} {city} --> roster_size\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], "source": [ "deps = an.find_dependencies(df)\n", "print(deps)" @@ -184,20 +68,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " --> team\n", - " {team} --> city\n", - " {team} {city} --> state\n", - " {team} --> roster_size\n" - ] - } - ], + "outputs": [], "source": [ "deps.remove_dep(\"team\", [\"city\"])\n", "deps.remove_dep(\"roster_size\", [\"city\"])\n", @@ -206,60 +79,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "team\n", - "\n", - "team (6 rows)\n", - "\n", - "team : Unknown; index\n", - "city : Unknown; foreign_key\n", - "roster_size : Integer\n", - "\n", - "\n", - "\n", - "city\n", - "\n", - "city (6 rows)\n", - "\n", - "city : Unknown; index\n", - "state : Unknown\n", - "\n", - "\n", - "\n", - "team->city\n", - "\n", - "\n", - "city\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "es = an.make_entityset(df, deps)\n", "es.plot()" diff --git a/docs/source/guides/kaggle_food_dataset.ipynb b/docs/source/guides/kaggle_food_dataset.ipynb index 2ea1363..edaae8d 100644 --- a/docs/source/guides/kaggle_food_dataset.ipynb +++ b/docs/source/guides/kaggle_food_dataset.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -31,140 +31,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Area AbbreviationArea CodeAreaItem CodeItemElement CodeElementUnitlatitudelongitude
0AFG2Afghanistan2511Wheat and products5142Food1000 tonnes33.9467.71
1AFG2Afghanistan2805Rice (Milled Equivalent)5142Food1000 tonnes33.9467.71
2AFG2Afghanistan2513Barley and products5521Feed1000 tonnes33.9467.71
3AFG2Afghanistan2513Barley and products5142Food1000 tonnes33.9467.71
4AFG2Afghanistan2514Maize and products5521Feed1000 tonnes33.9467.71
\n", - "
" - ], - "text/plain": [ - " Area Abbreviation Area Code Area Item Code \\\n", - "0 AFG 2 Afghanistan 2511 \n", - "1 AFG 2 Afghanistan 2805 \n", - "2 AFG 2 Afghanistan 2513 \n", - "3 AFG 2 Afghanistan 2513 \n", - "4 AFG 2 Afghanistan 2514 \n", - "\n", - " Item Element Code Element Unit latitude \\\n", - "0 Wheat and products 5142 Food 1000 tonnes 33.94 \n", - "1 Rice (Milled Equivalent) 5142 Food 1000 tonnes 33.94 \n", - "2 Barley and products 5521 Feed 1000 tonnes 33.94 \n", - "3 Barley and products 5142 Food 1000 tonnes 33.94 \n", - "4 Maize and products 5521 Feed 1000 tonnes 33.94 \n", - "\n", - " longitude \n", - "0 67.71 \n", - "1 67.71 \n", - "2 67.71 \n", - "3 67.71 \n", - "4 67.71 " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "food_df = load_sample()\n", "food_df.head()" @@ -172,97 +41,36 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 10/10 [00:03<00:00, 3.28it/s]\n" - ] - } - ], + "outputs": [], "source": [ "entityset = an.auto_entityset(food_df, name=\"Foods\")" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Entityset: Foods\n", - " DataFrames:\n", - " Area Code_Element Code_Item Code [Rows: 21477, Columns: 4]\n", - " Element Code [Rows: 2, Columns: 2]\n", - " Item Code [Rows: 117, Columns: 2]\n", - " Area Code [Rows: 174, Columns: 5]\n", - " Area Abbreviation [Rows: 169, Columns: 2]\n", - " Relationships:\n", - " Area Code_Element Code_Item Code.Area Code -> Area Code.Area Code\n", - " Area Code_Element Code_Item Code.Item Code -> Item Code.Item Code\n", - " Area Code_Element Code_Item Code.Element Code -> Element Code.Element Code\n", - " Area Code.Area Abbreviation -> Area Abbreviation.Area Abbreviation" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "entityset" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": "\n\n\n\n\n\nFoods\n\n\n\nArea Code_Element Code_Item Code\n\nArea Code_Element Code_Item Code (21477 rows)\n\nArea Code_Element Code_Item Code : Integer; index\nArea Code : Integer; foreign_key\nItem Code : Integer; foreign_key\nElement Code : Integer; foreign_key\n\n\n\nElement Code\n\nElement Code (2 rows)\n\nElement Code : Integer; index\nElement : Unknown\n\n\n\nArea Code_Element Code_Item Code->Element Code\n\n\nElement Code\n\n\n\nItem Code\n\nItem Code (117 rows)\n\nItem Code : Integer; index\nItem : Unknown\n\n\n\nArea Code_Element Code_Item Code->Item Code\n\n\nItem Code\n\n\n\nArea Code\n\nArea Code (174 rows)\n\nArea Abbreviation : Unknown; foreign_key\nArea Code : Integer; index\nArea : Unknown\nlatitude : Double\nlongitude : Double\n\n\n\nArea Code_Element Code_Item Code->Area Code\n\n\nArea Code\n\n\n\nArea Abbreviation\n\nArea Abbreviation (169 rows)\n\nArea Abbreviation : Unknown; index\nUnit : Categorical\n\n\n\nArea Code->Area Abbreviation\n\n\nArea Abbreviation\n\n\n\n", - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "entityset.plot()" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - 
"text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import featuretools as ft\n", "\n", diff --git a/docs/source/guides/kaggle_liquor_sales_dataset.ipynb b/docs/source/guides/kaggle_liquor_sales_dataset.ipynb index 327a4d6..42de53f 100644 --- a/docs/source/guides/kaggle_liquor_sales_dataset.ipynb +++ b/docs/source/guides/kaggle_liquor_sales_dataset.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -31,131 +31,9 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rows: 1000\n", - "Columns: 12\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Invoice/Item NumberDateStore NumberStore NameAddressCityZip CodeStore LocationCounty NumberCountyCategoryCategory Name
0S2886570000111/09/20152538Hy-Vee Food Store #3 / Waterloo1422 FLAMMANG DRWATERLOO507021422 FLAMMANG DR\\nWATERLOO 50702\\n(42.459938, ...7Black Hawk1701100DECANTERS & SPECIALTY PACKAGES
1S2933930009111/30/20152662Hy-Vee Wine & Spirits / Muscatine522 MULBERRY, SUITE AMUSCATINE52761522 MULBERRY, SUITE A\\nMUSCATINE 52761\\n70Muscatine1701100DECANTERS & SPECIALTY PACKAGES
2S2886690000111/11/20153650Spirits, Stogies and Stuff118 South Main St.HOLSTEIN51025118 South Main St.\\nHOLSTEIN 51025\\n(42.490073...47Ida1701100DECANTERS & SPECIALTY PACKAGES
\n", - "
" - ], - "text/plain": [ - " Invoice/Item Number Date Store Number \\\n", - "0 S28865700001 11/09/2015 2538 \n", - "1 S29339300091 11/30/2015 2662 \n", - "2 S28866900001 11/11/2015 3650 \n", - "\n", - " Store Name Address City \\\n", - "0 Hy-Vee Food Store #3 / Waterloo 1422 FLAMMANG DR WATERLOO \n", - "1 Hy-Vee Wine & Spirits / Muscatine 522 MULBERRY, SUITE A MUSCATINE \n", - "2 Spirits, Stogies and Stuff 118 South Main St. HOLSTEIN \n", - "\n", - " Zip Code Store Location County Number \\\n", - "0 50702 1422 FLAMMANG DR\\nWATERLOO 50702\\n(42.459938, ... 7 \n", - "1 52761 522 MULBERRY, SUITE A\\nMUSCATINE 52761\\n 70 \n", - "2 51025 118 South Main St.\\nHOLSTEIN 51025\\n(42.490073... 47 \n", - "\n", - " County Category Category Name \n", - "0 Black Hawk 1701100 DECANTERS & SPECIALTY PACKAGES \n", - "1 Muscatine 1701100 DECANTERS & SPECIALTY PACKAGES \n", - "2 Ida 1701100 DECANTERS & SPECIALTY PACKAGES " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "df = load_sample()\n", "print(\"Rows: \" + str(df.shape[0]))\n", @@ -165,41 +43,16 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "Invoice/Item Number object\n", - "Date object\n", - "Store Number int64\n", - "Store Name object\n", - "Address object\n", - "City object\n", - "Zip Code object\n", - "Store Location object\n", - "County Number int64\n", - "County object\n", - "Category int64\n", - "Category Name object\n", - "dtype: object" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "df.dtypes" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -215,27 +68,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 11/11 [00:08<00:00, 1.32it/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "17.265804052352905" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "start = time.time()\n", "entityset = an.auto_entityset(df, accuracy=0.96, name=\"liquor orders\")\n", @@ -251,23 +86,9 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "data": { - "image/svg+xml": "\n\n\n\n\n\nliquor orders\n\n\n\nInvoice/Item Number\n\nInvoice/Item Number (1000 rows)\n\nInvoice/Item Number : Unknown; index\nCategory_Date : Integer; foreign_key\n\n\n\nCategory_Date\n\nCategory_Date (968 rows)\n\nCategory_Date : Integer; index\nDate : Datetime\nStore Number : Integer; foreign_key\nCategory : Integer; foreign_key\n\n\n\nInvoice/Item Number->Category_Date\n\n\nCategory_Date\n\n\n\nStore Number\n\nStore Number (537 rows)\n\nStore Number : Integer; index\nStore Name : Unknown\nAddress : Unknown\nZip Code : Unknown; foreign_key\nStore Location : Unknown\n\n\n\nCategory_Date->Store Number\n\n\nStore Number\n\n\n\nCategory\n\nCategory (56 rows)\n\nCategory : Integer; index\nCategory Name : Unknown\n\n\n\nCategory_Date->Category\n\n\nCategory\n\n\n\nZip Code\n\nZip Code (229 rows)\n\nCity : Unknown; foreign_key\nZip Code : Unknown; index\n\n\n\nStore Number->Zip Code\n\n\nZip Code\n\n\n\nCity\n\nCity 
(190 rows)\n\nCity : Unknown; index\nCounty Number : Integer; foreign_key\n\n\n\nZip Code->City\n\n\nCity\n\n\n\nCounty Number\n\nCounty Number (86 rows)\n\nCounty Number : Integer; index\nCounty : Unknown\n\n\n\nCity->County Number\n\n\nCounty Number\n\n\n\n", - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "entityset.plot()" ] @@ -281,35 +102,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Entityset: liquor orders\n", - " DataFrames:\n", - " Invoice/Item Number [Rows: 1000, Columns: 2]\n", - " Category_Date [Rows: 968, Columns: 4]\n", - " Store Number [Rows: 537, Columns: 5]\n", - " Zip Code [Rows: 229, Columns: 2]\n", - " City [Rows: 190, Columns: 2]\n", - " County Number [Rows: 86, Columns: 2]\n", - " Category [Rows: 56, Columns: 2]\n", - " Relationships:\n", - " Invoice/Item Number.Category_Date -> Category_Date.Category_Date\n", - " Category_Date.Category -> Category.Category\n", - " Category_Date.Store Number -> Store Number.Store Number\n", - " Store Number.Zip Code -> Zip Code.Zip Code\n", - " Zip Code.City -> City.City\n", - " City.County Number -> County Number.County Number" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "entityset" ] diff --git a/docs/source/install.rst b/docs/source/install.rst index 1091fa2..5131218 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -16,7 +16,7 @@ In order to use plotting you will need to install the graphviz library. pip users:: pip install graphviz - + conda users:: conda install -c conda-forge python-graphviz diff --git a/presentation.key b/presentation.key index cf70a225c944aa8bc4c6bf414d65b81c85bf586e..473fbdd4e3121e0fad7fff7c0d9d169c042f63a5 100644 GIT binary patch delta 11957 zcmd^Fd6ZP;mFK;xZmO&IuHL)4mZpIoWO@6m$6^OiP__mGvIS8PWF(9W6H!KKKoGQ9 z=LH^-CJLA!iy%DWWq{*An80MnA&>}$U?a)|(36ZHZWz(Iud2GL-u-&c@&D;_#6Gva z^r+1^O$^(Hv3YKt6uv*`xNo!d2y&6mA>HYN@c9ao})ML=6Yke z`SsoPc6Q%Vk6klhCy%EMGoJ`WZTSZt-qr`9uX-~qF|KWx^8v0-LthQ+gy94n>l?;y}f+PDX%$x+x_~=9Bj(>eZM1vc?4M;n`qkVb>!Oca0@HK2 ztWsySwX&9b!8+QPtARctZnhsbHgJ3MoNd+|l^X)`b7# zvx+($Sv1-@)5?de1^xJ4a7TasnF{#D5Ba5DxMKo;ohwPKzSh3F;HeyEc{SWSiC+-u z9RKuoSL^hA{v@>i3%{cnhOXr27Usr3@aJ=1&FEvDXys4A`c3?2dC;S*x zED6@EF8oy;yzzK`j|K;TJsZG#G=Ct+$t;ppnR?5CVHyDO)o~b-!yFhgqL`zs^OH&$QBW!75Yhzsh;fks z0N12r=1;&7mDR{HS&eDrN*Pm$<{i(E&Vk-?!KPU$h$#$*X{Z=+s~}JgK&LYh(_#kMh>li>NE8Ie`7Vgb zh9nY)NwRJbgG59aaabbwiGPz269q9w@FeJxPB%6i1_L{x1B%x)Mt@l$=_U#qYh{e4 z1FD~nM-pP_oft7(R5X#q8-1Y@lO#o=R!WL05XBH;MhXInIwl)hyr{S#X3$uw3d8y+ z(iqe!>5U19&G~u3!u1ZjJrag-V+;SAO9BlmDop@EQ<-=gnof-G5NImR903b94KXGY z1)3xxAl=x;yrlJ6)|iwNG=tV3!O$cU6)Y?n#Vig>hs!97h=M{{5b;mTlq881LK23Z zT#33e^rK4|t{xs&L)KM^1W(bl7{Ln$iB@qGrKujOClvlfPgkuWS+D{Mw-tbLyl`>X zWYj~sMVHf2N9G{@)+AbWkb8BaT!&GYX!C%17o%{&B*Tn#(F?W{BC%9L>j_@4Cu15Z z4P``qLrz0J*%}%O?Fs~kUKrw#L{b7!3rZZ4Dl)pNat3iE!l?s_XoW;k+9V63u0lm&(bqqKlF>zHr?scFsL`4u$T}@nf)k$ z2pA$gOf+PMqGgFT`7)+?`uJpUk==>13btj$7bv0?#%Y{W5lIC1_Wq)-o7=-G7#Z{t zhnt{a$Kke4(ADn~Rz+c$TEQSPXE9-wRqUAD%Orf4k(`Qa=UyeFN15|=CTEox6Z~df zWulL8CO#H$BS~W+rqK5m>MErJ7FQEnOr4-c5+xzMD%o|4O#6j+U}#^$K8hGFF^P+#aqgf=t9skiLS+H z(N8vsj^9#Q*I~m+yM1gpS;jV;?$1>bL|vg-A*n)|@{+_Jzds(lb}d9XMw1R3PU>WA zIMW%HJPRv=N@9Ul(%i?^fPw_OOPY^j2hFh}QMSZ}GsCY5bcu?TA&Z+PDhyQ7J(Ncf z^f9V_Or*?o@4q>Egl;T-*W@v$*b 
zI<#aS2P^I=@re%H1_t2Vijs%gAJPu!i1JAFj3e-6LR3IQ80J9wqCEkEE`|B_o*+Jz zaf1%AL(WHk(U)Apd2SaX>d5&H=qMK==!jFDHpF38=O=|=1RWz%==;b49kCKYM;RZ{ zB#2YRTvZoT=>MItFS*H!N;>4gO*21_k0lqU#m=HKn4<*k9 z?(o8$ivkb&;FT8xr%T}R*1)_5xb`1`PxIlz9|A{mq4i=w&w_U@2PVrPRFn>_gs1zI zepd)jH14q)75Cs{#=G4A;GGV{KG>09f zG|SYOm&8Oo%Xz7};(;9B!LyjQefS1Wm&9c8kyUmT(koRTdqE;<#>ZOPJ4q|m4xV%x zU8Q3y>pl$CrzwY3g+8$n)JB@#VhNT~4Z2fO4QAejUnZm?0?kEbF%=HnUG_-Cip(rq z1$$Ny02yp~V>IQ4Gg*6i?(5x21xNF(4wGOrUdJ@kE}bg?QBV`eSQ_^lO1nvQdA=1QhM<70)GB;lRRi_DJad&!|@z!zf?ADnw2-W{5#0It9;M6_8@mKLQr(Z zm)OJ*mcSq|br`#F8Uz(vF1Ha#dk#+BLB`Tdc48T(jgx=_skE08fYf8OXd!RY9#z1n z8cn_Cm_m61ucR4@?HdiB^FSabAq!E!poo@c97tzy4j_q+@xiNG%P$r~^RLR64NOG@ z>osv1)_Dp@Ri)Lbqlh^0pl4wpL`2y4F=?l29YmZGBu))7OiDyVVj5njPE(6i=8Y{h zYdFp&n&Hfa=r0-)9Rp)8L-`jc_!Nq`ibhvL45xjOOgprQU^YP8}y2Y>2PvAs`wJSBs9=io4xz+FIplWRMSCtFq|b|#~0&ZFF&VfckB@nk$) zZhwo|15z#Un_yZ6FnIJVm2G~THJLFSlY*R&SJH2>aDq=u3Pzp99&!s+ABRyXQ8J#A zPAd=o(LlO~O%#6=r7J@sp3je~U}@A^bD`1;4^FPw_tRu}Q8Q*HCb7d?ti;rd9S|sL zK{G%WMJmEVkA}O$tFg;u=IcRxHq&u|pfQapR&-`ohn$g4lsItkF9dKBlX7-KhO^aF zRwNvYbGdX=X#HqKr3%Y$R1}WL@E-Nc{}b=wz?k(O1L%xeknmo9V`V|6_h>WPvG?ez z?~Of@t%x9}cVVzcmm&7(m8wC%OLMSCHzM{ZGvark)Kf^Qlr0fx8C>2DM?FOcfY|Gi z1M9T&fi?UZQXewc#m}^Cp-{t2dl1T!C&gCmX*UCqw0w+ z>7|Jt%OMB9H9Jh1(6X?q>m>KuDLh*oIK1P|HAs&~n5>61L%zEyofa!X@g>)8dN z^)Oqid023NG|yvQI#aX8TC=gnfNPJ`j3=LW=b>?9O|$6Ub*~q2*S9`+|IalWvf$}U zH81r8PcXQ%%)M@<2THZz?h-h5Pw<6Y7&bO2cY_7we~eRj#WJoje-4|{Py4T*~Vd>oP_Wo|^2OhY#Km1~~TiU`wKY#QY z9!9+r{-79yE8!Q5;oz6y?gS5Si(XG(-Zsc6Pd*!Pf^o~VE9;SR8S(V{ADyrXPF2)>SOz3_^)70$g;H?{H_+@w$tTbN>GY07jm3 delta 12680 zcmds7dzci}v8Q|4WoCEwIs0JQ$MmisE28v!reP6R1q>)Jm4^rj(TMSdV6tCCT~HKJ zM93(O4^&Ws3O;bulPEC@M7fHB++d5fxDcy}z?PJKcL~^vmCOzmIA9be*a? zRlllJr@9~iWU=(v=ZmE$=NCvyYJR~91yVus@ydNtB)Ot$xD-ykTQxNoH-qawz^_t2 zLJB933~5N+TK`3k6iLl(d|k%)$UgtVjd|>r9Lzp++&dWRe|A&qNWZ^ymkLtP4=CY8 z+qB`u;rt#U>4fqdTl7$Vp7daHY5YMua!SYew$on0(>sTHv*MUgHS8eQCNg3R9_$z~15>n- zCw9lpw9&iqJ8;Yd3@sb;rVOf!&jMAd|JmK4Z&IE3kfbDcO>BV%6_Xa>!F`j?z{J$P zDg7lpubeg+Gq0T1!VUA(1*1VGT^QrKTzDH_!ecXyJP^Eh`ThK^ZT0~!bXMyWjJ?*n zwFfSDw3YHTt+~DyD^G5^VI6);?o^VC7o3WxI~FYEPp91+<14=Imhlj9$=X=zt6RS3 zoH$|8(i1T_^-eW)>C%V1!BLnw_0HS5 zJ0;B6{-;au>pb@;W_yAAAsUtCRe-L2grwneV1(3!OZ7;p4i|f*G^!wyUlWQo z93I!LG~(XojXye4dH|0{oY667?itd3au}1&4SU0B5sKl-~-0?iX?S2+YlfI`Tp(s^k}iq_THLM{t#MOS-ksALEX>TCR7$A1JTK z?6rgB-&NtNoF(7b9bDH;kiU?^=~2&C9{R?rh(@+|j!* z?wpf%4`1!B@p&&#!gb}7d7F=clxe&2RxH5v(8L~hjt1T0hkHD+QVK5|G%Umh4GT{> z2V*Pl36GwF>nGoaZySf}RU`9n0=n4YQTg{4LiY0W@;^O*`|Cy(OiqB;TQ?QtRLF|L z-oB^cnnDare57FJWw^FiMj~@C;nhWv=cf%;@>Tw4nRYy(bo^JjJUynyHA5A~G*!3F z%$PglKk{mh2eOiC_ED&?xeQQis42|Vah<-T_?bpL-F-!2za|fY{hOH9Uid3H2t;wA zKLtczum1*!mS`&uL{XHk>ICzKqgf&y2dwC$tUxp@BVa2J9UO>Rq9WjQch#1{AP|Y3 zz@bGz2P})22f8)U_n;zJf||BS*JfA=TU@w-s1z3tI+M`Zsv{a4hBU}r$EN(8gpI`! 
zAX?(;pi08m+THor!VD0rUo33k*>0d-v~s2oE;Eq5($27H!GK@4v=x+dC5H{y=J1WzRm*b?oO6paXl_hRvjk*ICt+oMFDWxHr3rQ z{n$$Dq}o~5m>@v_-3>8A!#W9Z-EEp_Yrc&=AuvW-`9Gh$_ zhU4g%D}u`iINVdlG7Z}YoLjoGC~vit(9z*)Q55u~=^?HY6B}_$_nqabZrXN2L`ft3Lw0bC1rj{l zKfd@kIKKsVt6{n9yl0Eo7NNe%d9nEVJ{jNd50`up@ICT|Xnb2I8xQht+*h{4_s4Y> zQMOJB2Si>X0F{h1yvZSn^YTvqPC8~gQE*9RL3M0KpW2}G;Qdb znDU_vNgKMQQKc<|qzx!diX+3eI7GWfuVJIi6tu?95Gan^D+^1a6=@rr7BHP5cj0b} zy;@ju_+)GsFO4dBV@9VlP;>>hmvlTBalSXDE#z+6P{Ol9mpa&plU#yofg~c;PFTc* ziMU35XXvDSQ{$*zGFo_=wW+2?GAe`^A|b+@)P8^m+3M9ewnjmSmawVc9gXv=>4F-~ zF#_q>J@8aX^uo0L4Ft8w*FgFf^)yj-iN?Sf*-ChA90;clf38TC^jVe3$v~?JOw}}L zPL5lN0HZEWHBFk6O?XhSj~cg#QwJ3wv7&9&ghmcGYQjX}-&H>J)J0#RevFIZ)~0K^DiT6>Qd$|=lDv*;Ne~RWEMvAQ z=2z6CJkEuLU?uoThW}*_GCZ63hIDt1FO6p6Ty^N4fYj33g2t0Fa+d(g`N6Wl75psp+)1Ng_eF)IerS^Xsre zS6j4cMuLPJib-N!6qA9_N;7mEOEfOl7>IU{4!b{riH6)myrpF;flWP-Fx_Qe`ViMl zJy1ENWtkMFFc3)}0DXe!p3q3ng&|34HI*Oq#hBGCxIsi3&_v*j%1VquU?xy*c@TT_ ztI{oF(cQUhXjvByduhNF_s83!8x*l7%g zZcU%NJ2yx?Bvw|9IH~4%HQ`&WE4px4@*kyJXnyU8!t#i;~5%C0wJ(qP?8#p zR1<}4hS1&FFPd>O_>#>Fg-*5&l4LX-q{!y3<4;W2IJA(!ZiDi~b_Vj1h+1rrB-G$M zLllBBa)^M>f(?eNW>3HP(hFzjH#aK--0#L0H?gC~NB_472bCQIqfh1HI{)P8lp-{* z_l$}z!#U=P(NP(1g4mVgqkrrzD+TPhxzXdhBjudk7Ck=%`llC0UoXS`g1e)wyruvB z=IEEakNnx|(e?sdAA36*&*dpc16U|r=U zK4n1;ZrB+DhvRf2=}6%nQc=&jV+$)Xu2T(*)QKp9$d0bss1%9#;2AM3%C-XMyUwwM z9k5`B9&K*!VmvZj#ak%>ej8n#Ks(6OM#wn>up zJlg5kv>nhi67$~(q(ri#p)Hc7nQPMy3|?dTz;ieJ!v~M9(h`xMZp8KIkVQ5l?Bj$E zkPxdpoOg(Q8Zk)nlWqp%-t+6SH!B{&>y)mq9~oXhPLS9|dn#tkkP6(FPpGgkF zDxWwfusTMt~vN z0Flp0&9CB&Cq^Z9i9qsj=dY;DbRBTYAU;I1n#87V*a5htF^M8j)Dykx;C8ZNO}WLI z?y9T{$%S~Gy{d9gDfYT!Qk4r^e5arU5Tgg>gAdM5t5^YWbae^`31NYfN*qJ=NCMEZ z1HMj+;;|>pMZ+PmFptiLKeKobrzByrY6Xwz?y7^8ex^p3M$}z%^gxH$lZBcn(7Edz zLyuMv1v-+IbrtWx1Uht7Y4wD$08l(3I?6WR%U%H9@lE9(Ui{3CReizFV>2469vRTt zH_&xQB^ed%USCsJ8(T&CgL;-7Mn^wJ&>NXR;iE1b!#n@<5@oJc;3x5%Y zR-E+buzk{{A}B;55KLZiq0`Sd6tO=DUu9P8P9Ikl9h>eC!m;Eug2J2ZJot=&6n+>B zYzhpU6SoJPjcH)p zS?CPS2yJ$UsoyB3oF6;KD+u# zd}-BnM!Jv!%hh?9vu8&2TfH!cU0dDf+-$T{WLCVp`rT6S`+R@(raswd*2t`1Wv?ig zIkE^$ZNHSe+EQR_kmSat{ci1dd}Rs(BN!@#X0Phjdh1Ap{nz@y4xe! zXa>t{_xp7ZX9cb&NN2qZ>W!E<`6q01L;w0;)JGH;GmERgR)%=WaNyuH01*)`Z;4 z-in3UAXOPhK3?_p`-FWkRGGsgc>E|O##@6~7Q&ECx=ZZS=}L1}Ox%^j-a9ikP-3rL ztGo%oQyw!V7y3onH@{c@ZDPTDwF)jUh6mZTyOh79 z??~w%kk}6g6-{P`KUS6y9Po8|6mzmJ&2z!;WHvI~a1{N_to&c7YIyf&IIou7x0cje zY~OJW=VgT~6ab4QIqtlp$_cDB(QxZYFt^;=FmWX|oO8CacQh5gx R1}0z0Kf`~==Egm<{tvR|(1QQ~ diff --git a/pyproject.toml b/pyproject.toml index 32e4e52..0108b8a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -136,4 +136,4 @@ requires = [ "setuptools >= 61.0.0", "wheel" ] -build-backend = "setuptools.build_meta" \ No newline at end of file +build-backend = "setuptools.build_meta" diff --git a/release/upload.sh b/release/upload.sh index b5e7a3d..a24084f 100755 --- a/release/upload.sh +++ b/release/upload.sh @@ -13,4 +13,4 @@ python setup.py sdist bdist_wheel pip install --user twine # Upload to pypi or testpypi echo "Upoading to ${2:-pypi} . . ." 
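For context on the release script above: with the packaging metadata now living in pyproject.toml, the same sdist and wheel can also be produced by the PEP 517 builder that the Makefile's package target invokes (python -m build) rather than setup.py. A minimal sketch, assuming the build and twine packages are installed; the exact invocation below is illustrative and not taken from the repo:

    import glob
    import subprocess
    import sys

    # Build an sdist and a wheel into dist/ from the pyproject.toml metadata (PEP 517).
    subprocess.run([sys.executable, "-m", "build"], check=True)

    # Upload the built artifacts; twine expects explicit file paths, so expand the glob here.
    subprocess.run(
        [sys.executable, "-m", "twine", "upload", *glob.glob("dist/*")],
        check=True,
    )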
-python -m twine upload dist/* -r "${2:-pypi}"
\ No newline at end of file
+python -m twine upload dist/* -r "${2:-pypi}"

From 8885b7b8656e14cfb4d0a74e8e645462f9e50d5a Mon Sep 17 00:00:00 2001
From: Gaurav Sheni
Date: Thu, 15 Dec 2022 11:13:44 -0500
Subject: [PATCH 09/11] release notes

---
 docs/source/release_notes.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 5a9226d..6f1f3c0 100755
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -8,7 +8,7 @@ Future Release
     * Enhancements
     * Fixes
     * Changes
-        * Transition to pyproject.toml, removing setup.py and setup.cfg (:pr:`17`)
+        * Transition to pyproject.toml, removing setup.py and setup.cfg (:pr:`54`)
     * Documentation Changes
     * Testing Changes

From 7b7acf552092a4d41c886ef9f667a62d82d92460 Mon Sep 17 00:00:00 2001
From: Gaurav Sheni
Date: Thu, 15 Dec 2022 11:17:12 -0500
Subject: [PATCH 10/11] fix lint:

---
 autonormalize/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autonormalize/version.py b/autonormalize/version.py
index 5719d15..159d48b 100644
--- a/autonormalize/version.py
+++ b/autonormalize/version.py
@@ -1 +1 @@
-__version__ = "2.0.1"
\ No newline at end of file
+__version__ = "2.0.1"

From 3e163b89a7a756fc86ad4a8d970a96461d6c35af Mon Sep 17 00:00:00 2001
From: Gaurav Sheni
Date: Thu, 15 Dec 2022 11:24:08 -0500
Subject: [PATCH 11/11] fix entrypoint

---
 .gitignore                                           | 1 +
 Makefile                                             | 4 ++--
 docs/source/guides/editing_dependencies.ipynb        | 9 +++++++--
 docs/source/guides/kaggle_food_dataset.ipynb         | 9 +++++++--
 docs/source/guides/kaggle_liquor_sales_dataset.ipynb | 2 +-
 pyproject.toml                                       | 3 +++
 6 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index cb363bf..a9648c5 100755
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .DS_Store
+unpacked_sdist/
 
 # IDE
 .vscode
diff --git a/Makefile b/Makefile
index 3934d17..871fd64 100755
--- a/Makefile
+++ b/Makefile
@@ -22,11 +22,11 @@ lint-fix:
 	isort autonormalize
 
 .PHONY: test
-test: lint
+test:
 	pytest autonormalize/ -n auto
 
 .PHONY: testcoverage
-testcoverage: lint
+testcoverage:
 	pytest autonormalize/ -n auto --cov=autonormalize
 
 .PHONY: installdeps
diff --git a/docs/source/guides/editing_dependencies.ipynb b/docs/source/guides/editing_dependencies.ipynb
index 6b355f6..83dd0f2 100644
--- a/docs/source/guides/editing_dependencies.ipynb
+++ b/docs/source/guides/editing_dependencies.ipynb
@@ -21,7 +21,7 @@
    "outputs": [],
    "source": [
     "import pandas as pd\n",
-    "from featuretools.autonormalize import autonormalize as an"
+    "import autonormalize as an"
    ]
   },
   {
@@ -104,7 +104,12 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.9"
+   "version": "3.11.0 (main, Dec 6 2022, 11:47:45) [Clang 14.0.0 (clang-1400.0.29.202)]"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "ad1ba34bd5a1885be81787fa03106b0bf16ccecb57fa7da0f03b17fcc99ca7e7"
+   }
   }
  },
 "nbformat": 4,
diff --git a/docs/source/guides/kaggle_food_dataset.ipynb b/docs/source/guides/kaggle_food_dataset.ipynb
index edaae8d..d91fa18 100644
--- a/docs/source/guides/kaggle_food_dataset.ipynb
+++ b/docs/source/guides/kaggle_food_dataset.ipynb
@@ -26,7 +26,7 @@
    "import pandas as pd\n",
    "from demo.food import load_sample\n",
    "\n",
-    "from featuretools.autonormalize import autonormalize as an"
+    "import autonormalize as an"
    ]
   },
   {
@@ -104,7 +104,12 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-
"version": "3.11.0 (main, Dec 6 2022, 11:47:45) [Clang 14.0.0 (clang-1400.0.29.202)]" + }, + "vscode": { + "interpreter": { + "hash": "ad1ba34bd5a1885be81787fa03106b0bf16ccecb57fa7da0f03b17fcc99ca7e7" + } } }, "nbformat": 4, diff --git a/docs/source/guides/kaggle_liquor_sales_dataset.ipynb b/docs/source/guides/kaggle_liquor_sales_dataset.ipynb index 42de53f..ae191a0 100644 --- a/docs/source/guides/kaggle_liquor_sales_dataset.ipynb +++ b/docs/source/guides/kaggle_liquor_sales_dataset.ipynb @@ -26,7 +26,7 @@ "\n", "import pandas as pd\n", "from demo.liquor import load_sample\n", - "from featuretools.autonormalize import autonormalize as an" + "import autonormalize as an" ] }, { diff --git a/pyproject.toml b/pyproject.toml index 0108b8a..a07426d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,9 @@ dependencies = [ "Twitter" = "https://twitter.com/alteryxoss" "Chat" = "https://join.slack.com/t/alteryx-oss/shared_invite/zt-182tyvuxv-NzIn6eiCEf8TBziuKp0bNA" +[project.entry-points."featuretools_plugin"] +autonormalize = "autonormalize" + [project.optional-dependencies] test = [ "pytest >= 5.2.0",