make release-tag: Merge branch 'main' into stable

sdv-dev · Mar 14, 2024 · f80b6e9 · f80b6e9
2 parents 3a9a307 + 844e223
commit f80b6e9
Show file tree

Hide file tree

Showing 19 changed files with 464 additions and 259 deletions.
diff --git a/.github/workflows/dependency_checker.yml b/.github/workflows/dependency_checker.yml
@@ -0,0 +1,29 @@
+name: Dependency Checker
+on:
+  schedule:
+    - cron: '0 0 * * 1-5'
+  workflow_dispatch:
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.9
+      uses: actions/setup-python@v4
+      with:
+        python-version: 3.9
+    - name: Install dependencies
+      run: |
+        python -m pip install .[dev]
+        make check-deps OUTPUT_FILEPATH=latest_requirements.txt
+    - name: Create pull request
+      id: cpr
+      uses: peter-evans/create-pull-request@v4
+      with:
+        token: ${{ secrets.GH_ACCESS_TOKEN }}
+        commit-message: Update latest dependencies
+        title: Automated Latest Dependency Updates
+        body: "This is an auto-generated PR with **latest** dependency updates."
+        branch: latest-dependency-update
+        branch-suffix: short-commit-hash
+        base: main
diff --git a/.github/workflows/readme.yml b/.github/workflows/readme.yml
@@ -22,5 +22,7 @@ jobs:
       run: |
           python -m pip install --upgrade pip
           python -m pip install invoke rundoc .
+          python -m pip install tomli
+          python -m pip install packaging
     - name: Run the README.md
       run: invoke readme
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
@@ -219,9 +219,9 @@ This will perform the following actions:
 2. Bump the current version to the next release candidate, ``X.Y.Z.dev(N+1)``
 
 After this is done, the new pre-release can be installed by including the ``dev`` section in the
-dependency specification, either in ``setup.py``::
+dependency specification, either in ``pyproject.toml``::
 
-    install_requires = [
+    dependencies = [
         ...
         'ctgan>=X.Y.Z.dev',
         ...

diff --git a/HISTORY.md b/HISTORY.md
@@ -1,5 +1,36 @@
 # History
 
+## v0.9.1 - 2024-03-14
+
+This release changes the `loss_values` attribute of a CTGAN model to contain floats instead of `torch.Tensors`.
+
+### New Features
+
+* Return loss values as float values not PyTorch objects - Issue [#332](https://github.com/sdv-dev/CTGAN/issues/332) by @fealho
+
+### Maintenance
+
+* Transition from using setup.py to pyproject.toml to specify project metadata - Issue [#333](https://github.com/sdv-dev/CTGAN/issues/333) by @R-Palazzo
+* Remove bumpversion and use bump-my-version - Issue [#334](https://github.com/sdv-dev/CTGAN/issues/334) by @R-Palazzo
+* Add dependency checker - Issue [#336](https://github.com/sdv-dev/CTGAN/issues/336) by @amontanez24
+
+## v0.9.0 - 2024-02-13
+
+This release makes CTGAN sampling more efficient by saving the frequency of each categorical value.
+
+### New Features
+
+* Improve DataSampler efficiency - Issue [#327](https://github.com/sdv-dev/CTGAN/issues/327) by @fealho
+
+## v0.8.0 - 2023-11-13
+
+This release adds a progress bar that will show when setting the `verbose` parameter to `True`
+when initializing `TVAE`.
+
+### New Features
+
+* Add verbosity TVAE (progress bar + save the loss values) - Issue [#300](https://github.com/sdv-dev/CTGAN/issues/300) by @frances-h
+
 ## v0.7.5 - 2023-10-05
 
 This release adds a progress bar that will show when setting the `verbose` parameter to True when initializing `CTGAN`. It also removes a warning that was showing.

diff --git a/MANIFEST.in b/MANIFEST.in
diff --git a/Makefile b/Makefile
@@ -76,16 +76,9 @@ install-test: clean-build clean-pyc ## install the package and test dependencies
 install-develop: clean-build clean-pyc ## install the package in editable mode and dependencies for development
 	pip install -e .[dev]
 
-MINIMUM := $(shell sed -n '/install_requires = \[/,/]/p' setup.py | head -n-1 | tail -n+2 | sed 's/ *\(.*\),$?$$/\1/g' | tr '>' '=')
-
-.PHONY: install-minimum
-install-minimum: ## install the minimum supported versions of the package dependencies
-	pip install $(MINIMUM)
-
 
 # LINT TARGETS
 
-
 .PHONY: lint
 lint: ## check style with flake8 and isort
 	invoke lint
@@ -138,8 +131,7 @@ coverage: ## check code coverage quickly with the default Python
 
 .PHONY: dist
 dist: clean ## builds source and wheel package
-	python setup.py sdist
-	python setup.py bdist_wheel
+	python -m build --wheel --sdist
 	ls -l dist
 
 .PHONY: publish-confirm
@@ -161,34 +153,34 @@ publish: dist publish-confirm ## package and upload a release
 bumpversion-release: ## Merge main to stable and bumpversion release
 	git checkout stable || git checkout -b stable
 	git merge --no-ff main -m"make release-tag: Merge branch 'main' into stable"
-	bumpversion release
+	bump-my-version bump release
 	git push --tags origin stable
 
 .PHONY: bumpversion-release-test
 bumpversion-release-test: ## Merge main to stable and bumpversion release
 	git checkout stable || git checkout -b stable
 	git merge --no-ff main -m"make release-tag: Merge branch 'main' into stable"
-	bumpversion release --no-tag
+	bump-my-version bump release --no-tag
 	@echo git push --tags origin stable
 
 .PHONY: bumpversion-patch
 bumpversion-patch: ## Merge stable to main and bumpversion patch
 	git checkout main
 	git merge stable
-	bumpversion --no-tag patch
+	bump-my-version bump --no-tag patch
 	git push
 
 .PHONY: bumpversion-candidate
 bumpversion-candidate: ## Bump the version to the next candidate
-	bumpversion candidate --no-tag
+	bump-my-version bump candidate --no-tag
 
 .PHONY: bumpversion-minor
 bumpversion-minor: ## Bump the version the next minor skipping the release
-	bumpversion --no-tag minor
+	bump-my-version bump --no-tag minor
 
 .PHONY: bumpversion-major
 bumpversion-major: ## Bump the version the next major skipping the release
-	bumpversion --no-tag major
+	bump-my-version bump --no-tag major
 
 .PHONY: bumpversion-revert
 bumpversion-revert: ## Undo a previous bumpversion-release
@@ -238,3 +230,10 @@ release-minor: check-release bumpversion-minor release
 
 .PHONY: release-major
 release-major: check-release bumpversion-major release
+
+# Dependency targets
+
+.PHONY: check-deps
+check-deps:
+	$(eval allow_list='numpy=|pandas=|scikit-learn=|tqdm=|torch=|rdt=')
+	pip freeze | grep -v "CTGAN.git" | grep -E $(allow_list) > $(OUTPUT_FILEPATH)
diff --git a/ctgan/__init__.py b/ctgan/__init__.py
@@ -4,7 +4,7 @@
 
 __author__ = 'DataCebo, Inc.'
 __email__ = '[email protected]'
-__version__ = '0.7.5'
+__version__ = '0.9.1.dev1'
 
 from ctgan.demo import load_demo
 from ctgan.synthesizers.ctgan import CTGAN

diff --git a/ctgan/data_sampler.py b/ctgan/data_sampler.py
@@ -7,7 +7,7 @@ class DataSampler(object):
     """DataSampler samples the conditional vector and corresponding data for CTGAN."""
 
     def __init__(self, data, output_info, log_frequency):
-        self._data = data
+        self._data_length = len(data)
 
         def is_discrete_column(column_info):
             return (len(column_info) == 1
@@ -115,33 +115,34 @@ def sample_original_condvec(self, batch):
         if self._n_discrete_columns == 0:
             return None
 
+        category_freq = self._discrete_column_category_prob.flatten()
+        category_freq = category_freq[category_freq != 0]
+        category_freq = category_freq / np.sum(category_freq)
+        col_idxs = np.random.choice(np.arange(len(category_freq)), batch, p=category_freq)
         cond = np.zeros((batch, self._n_categories), dtype='float32')
-
-        for i in range(batch):
-            row_idx = np.random.randint(0, len(self._data))
-            col_idx = np.random.randint(0, self._n_discrete_columns)
-            matrix_st = self._discrete_column_matrix_st[col_idx]
-            matrix_ed = matrix_st + self._discrete_column_n_category[col_idx]
-            pick = np.argmax(self._data[row_idx, matrix_st:matrix_ed])
-            cond[i, pick + self._discrete_column_cond_st[col_idx]] = 1
+        cond[np.arange(batch), col_idxs] = 1
 
         return cond
 
-    def sample_data(self, n, col, opt):
+    def sample_data(self, data, n, col, opt):
         """Sample data from original training data satisfying the sampled conditional vector.
 
+        Args:
+            data:
+                The training data.
         Returns:
-            n rows of matrix data.
+            n:
+                n rows of matrix data.
         """
         if col is None:
-            idx = np.random.randint(len(self._data), size=n)
-            return self._data[idx]
+            idx = np.random.randint(len(data), size=n)
+            return data[idx]
 
         idx = []
         for c, o in zip(col, opt):
             idx.append(np.random.choice(self._rid_by_cat_cols[c][o]))
 
-        return self._data[idx]
+        return data[idx]
 
     def dim_cond_vec(self):
         """Return the total number of categories."""

diff --git a/ctgan/synthesizers/ctgan.py b/ctgan/synthesizers/ctgan.py
@@ -175,8 +175,7 @@ def __init__(self, embedding_dim=128, generator_dim=(256, 256), discriminator_di
         self._transformer = None
         self._data_sampler = None
         self._generator = None
-
-        self.loss_values = pd.DataFrame(columns=['Epoch', 'Generator Loss', 'Distriminator Loss'])
+        self.loss_values = None
 
     @staticmethod
     def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1):
@@ -355,7 +354,8 @@ def fit(self, train_data, discrete_columns=(), epochs=None):
                     condvec = self._data_sampler.sample_condvec(self._batch_size)
                     if condvec is None:
                         c1, m1, col, opt = None, None, None, None
-                        real = self._data_sampler.sample_data(self._batch_size, col, opt)
+                        real = self._data_sampler.sample_data(
+                            train_data, self._batch_size, col, opt)
                     else:
                         c1, m1, col, opt = condvec
                         c1 = torch.from_numpy(c1).to(self._device)
@@ -365,7 +365,7 @@ def fit(self, train_data, discrete_columns=(), epochs=None):
                         perm = np.arange(self._batch_size)
                         np.random.shuffle(perm)
                         real = self._data_sampler.sample_data(
-                            self._batch_size, col[perm], opt[perm])
+                            train_data, self._batch_size, col[perm], opt[perm])
                         c2 = c1[perm]
 
                     fake = self._generator(fakez)
@@ -422,8 +422,8 @@ def fit(self, train_data, discrete_columns=(), epochs=None):
                 loss_g.backward()
                 optimizerG.step()
 
-            generator_loss = loss_g.detach().cpu()
-            discriminator_loss = loss_d.detach().cpu()
+            generator_loss = loss_g.detach().cpu().item()
+            discriminator_loss = loss_d.detach().cpu().item()
 
             epoch_loss_df = pd.DataFrame({
                 'Epoch': [i],

diff --git a/ctgan/synthesizers/tvae.py b/ctgan/synthesizers/tvae.py
@@ -1,11 +1,13 @@
 """TVAE module."""
 
 import numpy as np
+import pandas as pd
 import torch
 from torch.nn import Linear, Module, Parameter, ReLU, Sequential
 from torch.nn.functional import cross_entropy
 from torch.optim import Adam
 from torch.utils.data import DataLoader, TensorDataset
+from tqdm import tqdm
 
 from ctgan.data_transformer import DataTransformer
 from ctgan.synthesizers.base import BaseSynthesizer, random_state
@@ -112,7 +114,8 @@ def __init__(
         batch_size=500,
         epochs=300,
         loss_factor=2,
-        cuda=True
+        cuda=True,
+        verbose=False
     ):
 
         self.embedding_dim = embedding_dim
@@ -123,6 +126,8 @@ def __init__(
         self.batch_size = batch_size
         self.loss_factor = loss_factor
         self.epochs = epochs
+        self.loss_values = pd.DataFrame(columns=['Epoch', 'Batch', 'Loss'])
+        self.verbose = verbose
 
         if not cuda or not torch.cuda.is_available():
             device = 'cpu'
@@ -159,7 +164,15 @@ def fit(self, train_data, discrete_columns=()):
             list(encoder.parameters()) + list(self.decoder.parameters()),
             weight_decay=self.l2scale)
 
-        for i in range(self.epochs):
+        self.loss_values = pd.DataFrame(columns=['Epoch', 'Batch', 'Loss'])
+        iterator = tqdm(range(self.epochs), disable=(not self.verbose))
+        if self.verbose:
+            iterator_description = 'Loss: {loss:.3f}'
+            iterator.set_description(iterator_description.format(loss=0))
+
+        for i in iterator:
+            loss_values = []
+            batch = []
             for id_, data in enumerate(loader):
                 optimizerAE.zero_grad()
                 real = data[0].to(self._device)
@@ -176,6 +189,26 @@ def fit(self, train_data, discrete_columns=()):
                 optimizerAE.step()
                 self.decoder.sigma.data.clamp_(0.01, 1.0)
 
+                batch.append(id_)
+                loss_values.append(loss.detach().cpu().item())
+
+            epoch_loss_df = pd.DataFrame({
+                'Epoch': [i] * len(batch),
+                'Batch': batch,
+                'Loss': loss_values
+            })
+            if not self.loss_values.empty:
+                self.loss_values = pd.concat(
+                    [self.loss_values, epoch_loss_df]
+                ).reset_index(drop=True)
+            else:
+                self.loss_values = epoch_loss_df
+
+            if self.verbose:
+                iterator.set_description(
+                    iterator_description.format(
+                        loss=loss.detach().cpu().item()))
+
     @random_state
     def sample(self, samples):
         """Sample data similar to the training data.

diff --git a/latest_requirements.txt b/latest_requirements.txt
@@ -0,0 +1,6 @@
+numpy==1.26.4
+pandas==2.2.1
+rdt==1.10.0
+scikit-learn==1.4.1.post1
+torch==2.2.1
+tqdm==4.66.2