From 0b6e6c7ea5209a26e44c403efda4d38905c0c516 Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:54:51 +0530 Subject: [PATCH 01/14] Updating fork method in datasets.py --- scrunch/datasets.py | 58 ++++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/scrunch/datasets.py b/scrunch/datasets.py index 8ab1175..3ce50e4 100644 --- a/scrunch/datasets.py +++ b/scrunch/datasets.py @@ -2351,8 +2351,15 @@ def add_deck(self, name, description="", public=False): new_deck = self.resource.decks.create(payload) return self.decks[new_deck.self.split('/')[-2]] - def fork(self, description=None, name=None, is_published=False, - preserve_owner=True, project=None, **kwargs): + def fork( + self, + description=None, + name=None, + is_published=False, + preserve_owner=True, + project=None, + **kwargs + ): """ Create a fork of ds and add virgin savepoint. @@ -2371,15 +2378,23 @@ def fork(self, description=None, name=None, is_published=False, dataset otherwise the owner will be the current user in the session and the Dataset will be set under `Personal Project`. :param project: str, default=None - The project ID or URL for the project in which the fork - dataset should be created. + The project ID or URL for the project in which the fork + dataset should be created. :returns _fork: scrunch.datasets.BaseDataset """ + from scrunch.mutable_dataset import MutableDataset description = description or self.resource.body.description + LOG.warning( + "The preserve_owner parameter will be removed soon" + " in favour of providing a project or not " + "(in which case the behavior will be as it is currently" + " if preserve_owner=True and project=None).", + ) + if name is None: nforks = len(self.resource.forks.index) dsname = self.resource.body.name @@ -2387,13 +2402,7 @@ def fork(self, description=None, name=None, is_published=False, dsname = dsname.encode("ascii", "ignore") name = "FORK #{} of {}".format(nforks + 1, dsname) - body = dict( - name=name, - description=description, - is_published=is_published, - **kwargs - ) - + body = dict(name=name, description=description, is_published=is_published, **kwargs) # Handling project vs owner conflict owner = kwargs.get("owner") @@ -2401,22 +2410,27 @@ def fork(self, description=None, name=None, is_published=False, raise ValueError( "Cannot pass both 'project' & 'owner' parameters together. " "Please try again by passing only 'project' parameter." - ) + ) elif owner: project = owner - + # Setting project value based on preserve_owner. - if preserve_owner and project: - raise ValueError("Cannot pass 'project' or 'owner' when preserve_owner=True") - elif preserve_owner: - body["owner"] = self.resource.body.owner - elif project: - body["owner"] = ( - project if project.startswith("http") else get_project(project).url - ) + if preserve_owner: + if project: + raise ValueError( + "Cannot pass 'project' or 'owner' when preserve_owner=True." + ) + else: + if project: + body["owner"] = ( + project if project.startswith("http") else get_project(project).url + ) + else: + raise ValueError( + "Project parameter should be provided when preserve_owner=False." + ) payload = shoji_entity_wrapper(body) - _fork = self.resource.forks.create(payload).refresh() return MutableDataset(_fork) From 91fc740516d96f077fcce416c367d992fcd551c2 Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:57:05 +0530 Subject: [PATCH 02/14] Update test_datasets.py --- scrunch/tests/test_datasets.py | 51 +++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/scrunch/tests/test_datasets.py b/scrunch/tests/test_datasets.py index 63782aa..8b2dd36 100644 --- a/scrunch/tests/test_datasets.py +++ b/scrunch/tests/test_datasets.py @@ -1752,6 +1752,7 @@ class TestForks(TestCase): def test_fork(self): user_url = "some_user_url" + sess = MagicMock() body = JSONObject({ 'name': 'ds name', @@ -1769,7 +1770,8 @@ def _create(*args): ds_res.forks = MagicMock() ds_res.forks.create.side_effect = _create ds = StreamingDataset(ds_res) - forked_ds = ds.fork(preserve_owner=False) + forked_ds = ds.fork(preserve_owner=True) + assert isinstance(forked_ds, MutableDataset) ds_res.forks.create.assert_called_with(as_entity({ 'name': 'FORK #1 of ds name', @@ -1816,38 +1818,49 @@ def test_fork_project(self): with pytest.raises(ValueError, match=err_msg1): ds.fork(owner="ABCD", project="1234") - err_msg2 = ("Cannot pass 'project' or 'owner' when preserve_owner=True") + err_msg2 = ("Cannot pass 'project' or 'owner' when preserve_owner=True.") with pytest.raises(ValueError, match=err_msg2): ds.fork(preserve_owner=True, project="1234") with pytest.raises(ValueError, match=err_msg2): - ds.fork(preserve_owner=True, owner="1234") - + ds.fork(preserve_owner=True, owner="1234") def test_fork_preserve_owner(self): - project = 'http://test.crunch.io/api/projects/123/' + project = "http://test.crunch.io/api/projects/123/" sess = MagicMock() - body = JSONObject({ - 'name': 'ds name', - 'description': 'ds description', - 'owner': project, - }) + body = JSONObject( + { + "name": "ds name", + "description": "ds description", + "owner": project, + } + ) ds_res = MagicMock(session=sess, body=body) ds_res.project.self = project ds_res.forks = MagicMock() ds_res.forks.index = {} ds = BaseDataset(ds_res) - ds.fork(preserve_owner=True) - ds_res.forks.create.assert_called_with({ - 'element': 'shoji:entity', - 'body': { - 'name': 'FORK #1 of ds name', - 'description': 'ds description', - 'owner': project, # Project preserved - 'is_published': False, + + # Test validations + with pytest.raises( + ValueError, + match="Project parameter should be provided when preserve_owner=False.", + ): + ds.fork(preserve_owner=False, project=None) + + ds.fork(preserve_owner=False, owner=project) + ds_res.forks.create.assert_called_with( + { + "element": "shoji:entity", + "body": { + "name": "FORK #1 of ds name", + "description": "ds description", + "owner": project, # Project preserved + "is_published": False, + }, } - }) + ) def test_delete_forks(self): f1 = MagicMock() From 84853ed675cab7eeac2174d2c4e24582364e0f38 Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Wed, 4 Dec 2024 17:45:28 +0530 Subject: [PATCH 03/14] update --- scrunch/datasets.py | 2 ++ scrunch/tests/test_datasets.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/scrunch/datasets.py b/scrunch/datasets.py index 3ce50e4..3db5758 100644 --- a/scrunch/datasets.py +++ b/scrunch/datasets.py @@ -2420,6 +2420,8 @@ def fork( raise ValueError( "Cannot pass 'project' or 'owner' when preserve_owner=True." ) + else: + body["owner"] = self.resource.body.owner else: if project: body["owner"] = ( diff --git a/scrunch/tests/test_datasets.py b/scrunch/tests/test_datasets.py index 8b2dd36..fcd2b60 100644 --- a/scrunch/tests/test_datasets.py +++ b/scrunch/tests/test_datasets.py @@ -1777,6 +1777,8 @@ def _create(*args): 'name': 'FORK #1 of ds name', 'description': 'ds description', 'is_published': False, + 'owner': 'http://test.crunch.io/api/users/123/', + })) From 455bf86643d193cf0f63d17ed000b4b6cd3a1e9d Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Wed, 4 Dec 2024 17:50:14 +0530 Subject: [PATCH 04/14] Adding in comments for clarification --- scrunch/datasets.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scrunch/datasets.py b/scrunch/datasets.py index 3db5758..e2e2585 100644 --- a/scrunch/datasets.py +++ b/scrunch/datasets.py @@ -2421,9 +2421,11 @@ def fork( "Cannot pass 'project' or 'owner' when preserve_owner=True." ) else: + # Create fork in source dataset path. body["owner"] = self.resource.body.owner else: if project: + # Create fork in given Project path. body["owner"] = ( project if project.startswith("http") else get_project(project).url ) From be95ab6de0bd5d7e9dc0ade6a06bd058b513d858 Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Wed, 4 Dec 2024 17:54:29 +0530 Subject: [PATCH 05/14] updates --- scrunch/datasets.py | 11 +++++++++-- scrunch/tests/test_datasets.py | 1 - 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/scrunch/datasets.py b/scrunch/datasets.py index e2e2585..3779a9b 100644 --- a/scrunch/datasets.py +++ b/scrunch/datasets.py @@ -2402,7 +2402,13 @@ def fork( dsname = dsname.encode("ascii", "ignore") name = "FORK #{} of {}".format(nforks + 1, dsname) - body = dict(name=name, description=description, is_published=is_published, **kwargs) + body = dict( + name=name, + description=description, + is_published=is_published, + **kwargs + ) + # Handling project vs owner conflict owner = kwargs.get("owner") @@ -2410,7 +2416,7 @@ def fork( raise ValueError( "Cannot pass both 'project' & 'owner' parameters together. " "Please try again by passing only 'project' parameter." - ) + ) elif owner: project = owner @@ -2435,6 +2441,7 @@ def fork( ) payload = shoji_entity_wrapper(body) + _fork = self.resource.forks.create(payload).refresh() return MutableDataset(_fork) diff --git a/scrunch/tests/test_datasets.py b/scrunch/tests/test_datasets.py index fcd2b60..24821cb 100644 --- a/scrunch/tests/test_datasets.py +++ b/scrunch/tests/test_datasets.py @@ -1771,7 +1771,6 @@ def _create(*args): ds_res.forks.create.side_effect = _create ds = StreamingDataset(ds_res) forked_ds = ds.fork(preserve_owner=True) - assert isinstance(forked_ds, MutableDataset) ds_res.forks.create.assert_called_with(as_entity({ 'name': 'FORK #1 of ds name', From 2a74a050f88a062e8a58eb742890a1579875e81c Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Wed, 4 Dec 2024 17:59:26 +0530 Subject: [PATCH 06/14] minimize differences --- scrunch/datasets.py | 11 ++--------- scrunch/tests/test_datasets.py | 1 - 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/scrunch/datasets.py b/scrunch/datasets.py index 3779a9b..4281d6a 100644 --- a/scrunch/datasets.py +++ b/scrunch/datasets.py @@ -2351,15 +2351,8 @@ def add_deck(self, name, description="", public=False): new_deck = self.resource.decks.create(payload) return self.decks[new_deck.self.split('/')[-2]] - def fork( - self, - description=None, - name=None, - is_published=False, - preserve_owner=True, - project=None, - **kwargs - ): + def fork(self, description=None, name=None, is_published=False, + preserve_owner=True, project=None, **kwargs): """ Create a fork of ds and add virgin savepoint. diff --git a/scrunch/tests/test_datasets.py b/scrunch/tests/test_datasets.py index 24821cb..9edb87b 100644 --- a/scrunch/tests/test_datasets.py +++ b/scrunch/tests/test_datasets.py @@ -1752,7 +1752,6 @@ class TestForks(TestCase): def test_fork(self): user_url = "some_user_url" - sess = MagicMock() body = JSONObject({ 'name': 'ds name', From 3ce3b6d1eaddeaea12ee4079ca03466301ecfcf9 Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Thu, 12 Dec 2024 11:38:01 +0530 Subject: [PATCH 07/14] Updates as per code review feedback --- scrunch/datasets.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scrunch/datasets.py b/scrunch/datasets.py index 4281d6a..fa60a75 100644 --- a/scrunch/datasets.py +++ b/scrunch/datasets.py @@ -2373,6 +2373,8 @@ def fork(self, description=None, name=None, is_published=False, :param project: str, default=None The project ID or URL for the project in which the fork dataset should be created. + If project=None, the fork dataset will be created in the same + location as the source dataset. :returns _fork: scrunch.datasets.BaseDataset """ From b994da954000c1de62933ffc1459c29b3a73eca6 Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Thu, 12 Dec 2024 11:50:33 +0530 Subject: [PATCH 08/14] updating docstring for preserve_owner parameter updated behavior --- scrunch/datasets.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scrunch/datasets.py b/scrunch/datasets.py index fa60a75..211cf20 100644 --- a/scrunch/datasets.py +++ b/scrunch/datasets.py @@ -2367,12 +2367,13 @@ def fork(self, description=None, name=None, is_published=False, If True, the fork will be visible to viewers of ds. If False it will only be viewable to editors of ds. :param preserve_owner: bool, default=True - If True, the owner of the fork will be the same as the parent - dataset otherwise the owner will be the current user in the - session and the Dataset will be set under `Personal Project`. + If preserve_owner=True and project=None, the fork dataset will + be created in the same location as source dataset. + If preserve_owner=False and project is passed, the fork dataset + will be created in the given project location. :param project: str, default=None - The project ID or URL for the project in which the fork - dataset should be created. + The project ID or URL for the project in which the fork dataset + should be created. If project=None, the fork dataset will be created in the same location as the source dataset. From 4c66cea13abd3dfa5027f238fd159d2b63638b15 Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Fri, 13 Dec 2024 00:32:27 +0530 Subject: [PATCH 09/14] Updates as per code review --- scrunch/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrunch/datasets.py b/scrunch/datasets.py index 211cf20..bf23bae 100644 --- a/scrunch/datasets.py +++ b/scrunch/datasets.py @@ -2424,11 +2424,11 @@ def fork(self, description=None, name=None, is_published=False, ) else: # Create fork in source dataset path. - body["owner"] = self.resource.body.owner + body["project"] = self.resource.body.owner else: if project: # Create fork in given Project path. - body["owner"] = ( + body["project"] = ( project if project.startswith("http") else get_project(project).url ) else: From e2d0ef287f52a14f5f883abefb390930f2c6621d Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Fri, 13 Dec 2024 00:39:09 +0530 Subject: [PATCH 10/14] Update test_datasets.py --- scrunch/tests/test_datasets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scrunch/tests/test_datasets.py b/scrunch/tests/test_datasets.py index 9edb87b..3d7fe98 100644 --- a/scrunch/tests/test_datasets.py +++ b/scrunch/tests/test_datasets.py @@ -1775,7 +1775,7 @@ def _create(*args): 'name': 'FORK #1 of ds name', 'description': 'ds description', 'is_published': False, - 'owner': 'http://test.crunch.io/api/users/123/', + 'project': 'http://test.crunch.io/api/users/123/', })) @@ -1798,7 +1798,7 @@ def test_fork_project(self): 'body': { 'name': 'FORK #1 of ds name', 'description': 'ds description', - 'owner': project, # Project added + 'project': project, # Project added 'is_published': False, } } @@ -1856,7 +1856,7 @@ def test_fork_preserve_owner(self): "body": { "name": "FORK #1 of ds name", "description": "ds description", - "owner": project, # Project preserved + "project": project, # Project preserved "is_published": False, }, } From 930905a298dc92b9f02497b59f1da1de508dd934 Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Fri, 13 Dec 2024 00:51:21 +0530 Subject: [PATCH 11/14] fix tests --- scrunch/tests/test_datasets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scrunch/tests/test_datasets.py b/scrunch/tests/test_datasets.py index 3d7fe98..12d7469 100644 --- a/scrunch/tests/test_datasets.py +++ b/scrunch/tests/test_datasets.py @@ -1756,7 +1756,7 @@ def test_fork(self): body = JSONObject({ 'name': 'ds name', 'description': 'ds description', - 'owner': 'http://test.crunch.io/api/users/123/', + 'project': 'http://test.crunch.io/api/users/123/', 'streaming': 'yes', }) fork_res = MagicMock() @@ -1786,7 +1786,7 @@ def test_fork_project(self): body = JSONObject({ 'name': 'ds name', 'description': 'ds description', - 'owner': project, + 'project': project, }) ds_res = MagicMock(session=sess, body=body) ds_res.forks = MagicMock() @@ -1833,7 +1833,7 @@ def test_fork_preserve_owner(self): { "name": "ds name", "description": "ds description", - "owner": project, + "project": project, } ) ds_res = MagicMock(session=sess, body=body) From d616aa0fa65b336c31b6471e0b4d2113c8f8352d Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:12:05 +0530 Subject: [PATCH 12/14] fix --- scrunch/datasets.py | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/scrunch/datasets.py b/scrunch/datasets.py index bf23bae..a18c85d 100644 --- a/scrunch/datasets.py +++ b/scrunch/datasets.py @@ -2379,10 +2379,19 @@ def fork(self, description=None, name=None, is_published=False, :returns _fork: scrunch.datasets.BaseDataset """ - from scrunch.mutable_dataset import MutableDataset + + # Handling project vs owner conflict + owner = kwargs.get("owner") - description = description or self.resource.body.description + if project and owner: + raise ValueError( + "Cannot pass both 'project' & 'owner' parameters together. " + "Please try again by passing only 'project' parameter." + ) + elif owner: + project = owner + del kwargs["owner"] LOG.warning( "The preserve_owner parameter will be removed soon" @@ -2400,22 +2409,10 @@ def fork(self, description=None, name=None, is_published=False, body = dict( name=name, - description=description, + description=description or self.resource.body.description, is_published=is_published, **kwargs ) - - # Handling project vs owner conflict - owner = kwargs.get("owner") - - if project and owner: - raise ValueError( - "Cannot pass both 'project' & 'owner' parameters together. " - "Please try again by passing only 'project' parameter." - ) - elif owner: - project = owner - # Setting project value based on preserve_owner. if preserve_owner: if project: @@ -2424,7 +2421,7 @@ def fork(self, description=None, name=None, is_published=False, ) else: # Create fork in source dataset path. - body["project"] = self.resource.body.owner + body["project"] = self.resource.body.owner else: if project: # Create fork in given Project path. @@ -3510,5 +3507,5 @@ def execute(self, csv_file): if folder_name in folders_by_name: folders_by_name[folder_name].entity.delete() # Always delete the tmp dataset no matter what - tmp_ds.delete() + tmp_ds.delete) From 7592e214545cfae03709bc724740c909acba4c03 Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:15:48 +0530 Subject: [PATCH 13/14] fix tests --- scrunch/tests/test_datasets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scrunch/tests/test_datasets.py b/scrunch/tests/test_datasets.py index 12d7469..3d7fe98 100644 --- a/scrunch/tests/test_datasets.py +++ b/scrunch/tests/test_datasets.py @@ -1756,7 +1756,7 @@ def test_fork(self): body = JSONObject({ 'name': 'ds name', 'description': 'ds description', - 'project': 'http://test.crunch.io/api/users/123/', + 'owner': 'http://test.crunch.io/api/users/123/', 'streaming': 'yes', }) fork_res = MagicMock() @@ -1786,7 +1786,7 @@ def test_fork_project(self): body = JSONObject({ 'name': 'ds name', 'description': 'ds description', - 'project': project, + 'owner': project, }) ds_res = MagicMock(session=sess, body=body) ds_res.forks = MagicMock() @@ -1833,7 +1833,7 @@ def test_fork_preserve_owner(self): { "name": "ds name", "description": "ds description", - "project": project, + "owner": project, } ) ds_res = MagicMock(session=sess, body=body) From f139cb6cf780aca13fcfdb52a720e4f4effc2a2c Mon Sep 17 00:00:00 2001 From: shaikh-ma <88078876+shaikh-ma@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:18:31 +0530 Subject: [PATCH 14/14] fix typo --- scrunch/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrunch/datasets.py b/scrunch/datasets.py index a18c85d..11ad2c9 100644 --- a/scrunch/datasets.py +++ b/scrunch/datasets.py @@ -3507,5 +3507,5 @@ def execute(self, csv_file): if folder_name in folders_by_name: folders_by_name[folder_name].entity.delete() # Always delete the tmp dataset no matter what - tmp_ds.delete) + tmp_ds.delete()