diff --git a/Doc/library/dbm.rst b/Doc/library/dbm.rst index 36221c026d6d4b..faf58a96cef461 100644 --- a/Doc/library/dbm.rst +++ b/Doc/library/dbm.rst @@ -15,10 +15,16 @@ * :mod:`dbm.ndbm` If none of these modules are installed, the -slow-but-simple implementation in module :mod:`dbm.dumb` will be used. There +slow-but-simple implementation in module :mod:`dbm.dumb` will be used. There is a `third party interface <https://www.jcea.es/programacion/pybsddb.htm>`_ to the Oracle Berkeley DB. +.. note:: + None of the underlying modules will automatically shrink the disk space used by + the database file. However, :mod:`dbm.sqlite3`, :mod:`dbm.gnu` and :mod:`dbm.dumb` + provide a :meth:`!reorganize` method that can be used for this purpose. + + .. exception:: error A tuple containing the exceptions that can be raised by each of the supported @@ -186,6 +192,16 @@ or any other SQLite browser, including the SQLite CLI. The Unix file access mode of the file (default: octal ``0o666``), used only when the database has to be created. + .. method:: sqlite3.reorganize() + + If you have carried out a lot of deletions and would like to shrink the space + used on disk, this method will reorganize the database; otherwise, deleted file + space will be kept and reused as new (key, value) pairs are added. + + .. note:: + While reorganizing, as much as two times the size of the original database is required + in free disk space. However, be aware that this factor changes for each :mod:`dbm` submodule. + :mod:`dbm.gnu` --- GNU database manager --------------------------------------- @@ -284,6 +300,10 @@ functionality like crash tolerance. reorganization; otherwise, deleted file space will be kept and reused as new (key, value) pairs are added. + .. note:: + While reorganizing, up to the size of the original database is required + in free disk space. However, be aware that this factor changes for each :mod:`dbm` submodule. + ..
method:: gdbm.sync() When the database has been opened in fast mode, this method forces any @@ -438,6 +458,11 @@ The :mod:`!dbm.dumb` module defines the following: with a sufficiently large/complex entry due to stack depth limitations in Python's AST compiler. + .. warning:: + :mod:`dbm.dumb` does not support concurrent read/write access. (Multiple + simultaneous read accesses are safe.) When a program has the database open + for writing, no other program should have it open for reading or writing. + .. versionchanged:: 3.5 :func:`~dbm.dumb.open` always creates a new database when *flag* is ``'n'``. @@ -460,3 +485,13 @@ The :mod:`!dbm.dumb` module defines the following: .. method:: dumbdbm.close() Close the database. + + .. method:: dumbdbm.reorganize() + + If you have carried out a lot of deletions and would like to shrink the space + used on disk, this method will reorganize the database; otherwise, deleted file + space will not be reused. + + .. note:: + While reorganizing, no additional free disk space is required. However, be aware + that this factor changes for each :mod:`dbm` submodule. diff --git a/Doc/library/shelve.rst b/Doc/library/shelve.rst index 6e74a59b82b8ec..2a48815c4e0b64 100644 --- a/Doc/library/shelve.rst +++ b/Doc/library/shelve.rst @@ -75,8 +75,13 @@ Two additional methods are supported: Write back all entries in the cache if the shelf was opened with *writeback* set to :const:`True`. Also empty the cache and synchronize the persistent - dictionary on disk, if feasible. This is called automatically when the shelf - is closed with :meth:`close`. + dictionary on disk, if feasible. This is called automatically when + :meth:`reorganize` is called or the shelf is closed with :meth:`close`. + +.. method:: Shelf.reorganize() + + Calls :meth:`sync` and attempts to shrink space used on disk by removing empty + space resulting from deletions. .. 
method:: Shelf.close() @@ -116,6 +121,11 @@ Restrictions * On macOS :mod:`dbm.ndbm` can silently corrupt the database file on updates, which can cause hard crashes when trying to read from the database. +* :meth:`Shelf.reorganize` may not be available for all database packages and + may temporarily increase resource usage (especially disk space) when called. + Additionally, it will never run automatically and instead needs to be called + explicitly. + .. class:: Shelf(dict, protocol=None, writeback=False, keyencoding='utf-8') diff --git a/Lib/dbm/dumb.py b/Lib/dbm/dumb.py index def120ffc3778b..1bc239a84fff83 100644 --- a/Lib/dbm/dumb.py +++ b/Lib/dbm/dumb.py @@ -9,7 +9,7 @@ - seems to contain a bug when updating... - reclaim free space (currently, space once occupied by deleted or expanded -items is never reused) +items is not reused unless .reorganize() is called) - support concurrent access (currently, if two processes take turns making updates, they can mess up the index) @@ -17,8 +17,6 @@ - support efficient access to large databases (currently, the whole index is read when the database is opened, and some updates rewrite the whole index) -- support opening for read-only (flag = 'm') - """ import ast as _ast @@ -289,6 +287,34 @@ def __enter__(self): def __exit__(self, *args): self.close() + def reorganize(self): + if self._readonly: + raise error('The database is opened for reading only') + self._verify_open() + # Ensure all changes are committed before reorganizing. + self._commit() + # Open file in r+ to allow changing in-place. + with _io.open(self._datfile, 'rb+') as f: + reorganize_pos = 0 + + # Iterate over existing keys, sorted by starting byte.
+ for key in sorted(self._index, key = lambda k: self._index[k][0]): + pos, siz = self._index[key] + f.seek(pos) + val = f.read(siz) + + f.seek(reorganize_pos) + f.write(val) + self._index[key] = (reorganize_pos, siz) + + blocks_occupied = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE + reorganize_pos += blocks_occupied * _BLOCKSIZE + + f.truncate(reorganize_pos) + # Commit changes to index, which were not in-place. + self._commit() + + def open(file, flag='c', mode=0o666): """Open the database file, filename, and return corresponding object. diff --git a/Lib/dbm/sqlite3.py b/Lib/dbm/sqlite3.py index 7e0ae2a29e3a64..b296a1bcd1bbfa 100644 --- a/Lib/dbm/sqlite3.py +++ b/Lib/dbm/sqlite3.py @@ -15,6 +15,7 @@ STORE_KV = "REPLACE INTO Dict (key, value) VALUES (CAST(? AS BLOB), CAST(? AS BLOB))" DELETE_KEY = "DELETE FROM Dict WHERE key = CAST(? AS BLOB)" ITER_KEYS = "SELECT key FROM Dict" +REORGANIZE = "VACUUM" class error(OSError): @@ -122,6 +123,9 @@ def __enter__(self): def __exit__(self, *args): self.close() + def reorganize(self): + self._execute(REORGANIZE) + def open(filename, /, flag="r", mode=0o666): """Open a dbm.sqlite3 database and return the dbm object. diff --git a/Lib/shelve.py b/Lib/shelve.py index 50584716e9ea64..b53dc8b7a8ece9 100644 --- a/Lib/shelve.py +++ b/Lib/shelve.py @@ -171,6 +171,11 @@ def sync(self): if hasattr(self.dict, 'sync'): self.dict.sync() + def reorganize(self): + self.sync() + if hasattr(self.dict, 'reorganize'): + self.dict.reorganize() + class BsdDbShelf(Shelf): """Shelf implementation using the "BSD" db interface. diff --git a/Lib/test/test_dbm.py b/Lib/test/test_dbm.py index 4be7c5649da68a..7c4fbfa5456d0f 100644 --- a/Lib/test/test_dbm.py +++ b/Lib/test/test_dbm.py @@ -135,6 +135,67 @@ def test_anydbm_access(self): assert(f[key] == b"Python:") f.close() + def test_anydbm_readonly_reorganize(self): + self.init_db() + with dbm.open(_fname, 'r') as d: + # Early stopping. 
+ if not hasattr(d, 'reorganize'): + return + + self.assertRaises(dbm.error, lambda: d.reorganize()) + + def test_anydbm_reorganize_not_changed_content(self): + self.init_db() + with dbm.open(_fname, 'c') as d: + # Early stopping. + if not hasattr(d, 'reorganize'): + return + + keys_before = sorted(d.keys()) + values_before = [d[k] for k in keys_before] + d.reorganize() + keys_after = sorted(d.keys()) + values_after = [d[k] for k in keys_before] + self.assertEqual(keys_before, keys_after) + self.assertEqual(values_before, values_after) + + def test_anydbm_reorganize_decreased_size(self): + + def _calculate_db_size(db_path): + if os.path.isfile(db_path): + return os.path.getsize(db_path) + total_size = 0 + for root, _, filenames in os.walk(db_path): + for filename in filenames: + file_path = os.path.join(root, filename) + total_size += os.path.getsize(file_path) + return total_size + + # This test requires relatively large databases to reliably show difference in size before and after reorganizing. + with dbm.open(_fname, 'n') as f: + # Early stopping. + if not hasattr(f, 'reorganize'): + return + + for k in self._dict: + f[k.encode('ascii')] = self._dict[k] * 100000 + db_keys = list(f.keys()) + + # Make sure to calculate size of database only after file is closed to ensure file content are flushed to disk. + size_before = _calculate_db_size(os.path.dirname(_fname)) + + # Delete some elements from the start of the database. + keys_to_delete = db_keys[:len(db_keys) // 2] + with dbm.open(_fname, 'c') as f: + for k in keys_to_delete: + del f[k] + f.reorganize() + + # Make sure to calculate size of database only after file is closed to ensure file content are flushed to disk. 
+ size_after = _calculate_db_size(os.path.dirname(_fname)) + + self.assertLess(size_after, size_before) + def test_open_with_bytes(self): dbm.open(os.fsencode(_fname), "c").close() diff --git a/Misc/ACKS b/Misc/ACKS index 610dcf9f4238de..210b25a8503301 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -1362,6 +1362,7 @@ Milan Oberkirch Pascal Oberndoerfer Géry Ogam Seonkyo Ok +Andrea Oliveri Jeffrey Ollie Adam Olsen Bryan Olson diff --git a/Misc/NEWS.d/next/Library/2025-05-15-00-27-09.gh-issue-134004.e8k4-R.rst b/Misc/NEWS.d/next/Library/2025-05-15-00-27-09.gh-issue-134004.e8k4-R.rst new file mode 100644 index 00000000000000..a9a56d9239b305 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-05-15-00-27-09.gh-issue-134004.e8k4-R.rst @@ -0,0 +1,2 @@ +:mod:`shelve` as well as underlying :mod:`!dbm.dumb` and :mod:`!dbm.sqlite3` now have :meth:`!reorganize` methods to +recover unused free space previously occupied by deleted entries.