Commit 8ecbe8e: merge master

willronchetti committed Aug 14, 2024
2 parents: d190218 + 8e8b126
Showing 23 changed files with 1,009 additions and 233 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main-publish.yml
@@ -26,7 +26,7 @@ jobs:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.8
python-version: 3.9
- name: Install Python dependencies for publish
run: python -m pip install dcicutils==7.5.0
- name: Publish
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -25,7 +25,7 @@ jobs:
# Build matrix
strategy:
matrix:
python_version: [3.8, 3.11]
python_version: [3.9, 3.11, 3.12]
test_type: ['UNIT', 'INDEXING']

# Steps represent a sequence of tasks that will be executed as part of the job
2 changes: 1 addition & 1 deletion .github/workflows/static-checks.yml
@@ -27,7 +27,7 @@ jobs:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.8
python-version: 3.9
- name: Python Package Setup
run: |
make build-for-ga
56 changes: 55 additions & 1 deletion CHANGELOG.rst
@@ -6,12 +6,66 @@ snovault
Change Log
----------

11.16.1
11.22.0
=======

* Update ``drs`` validation to remove drs_uri


11.21.1
=======

* Minor changes to allow running (for example) both cgap-portal and smaht-portal
simultaneously locally, for localhost/dev purposes only:
- Minor updates to dev_servers.py and tests/elasticsearch_fixture.py
to allow defining a transport_port for ElasticSearch.
- Minor updates to dev_servers.py and tests/postgresql_fixture.py to allow
parsing sqlalchemy.url in the ini file (e.g. development.ini) for the
Postgres port and temporary directory path; see the example below.
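
For example, a development.ini sketch (the sqlalchemy.url value is taken from the
comments in dev_servers.py; the transport_ports property name comes from the code
there, though the value shown for it is hypothetical)::

    sqlalchemy.url = postgresql://postgres@localhost:5442/postgres?host=/tmp/snovault/pgdata
    elasticsearch.server.transport_ports = 9301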


11.21.0
=======

* Fix in indexing_views.py for frame=raw not including the uuid.


11.20.0
=======

* Bug fix: use loadxl_order() in staggered reindexing
* Add B-tree index to the rid column in propsheets to optimize revision history retrieval; see the sketch below
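
A minimal sketch of such an index declaration (assuming SQLAlchemy; the table and
column definitions here are simplified stand-ins for snovault's actual storage model)::

    from sqlalchemy import Column, Index, MetaData, Table, types

    metadata = MetaData()
    # Simplified stand-in for the real propsheets table definition.
    propsheets = Table(
        'propsheets', metadata,
        Column('rid', types.String),  # hypothetical column type, for illustration only
    )
    # B-tree is the Postgres default index type; it is named explicitly here for clarity.
    Index('ix_propsheets_rid', propsheets.c.rid, postgresql_using='btree')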


11.19.0
=======

* Fix for revision history - deepcopy history so as not to modify props in place


11.18.0
=======

* Dropped support for Python 3.8.
* Updates related to Python 3.12:
- Had to update venusian (from 1.2.0) to 3.1.0.
- Had to update pyramid (from 1.10.4) to 1.10.8 (its import of the imp module,
which was removed in Python 3.12, was no longer found).
- Had to add pmdarima (no module pyramid.compat).
- Had to pin/update numpy to 1.26.4, as it was otherwise implicitly resolved
(by some other dependency) to 1.24.4, which failed to build with Python 3.12.
- And had to raise the lower bound of Python (from 3.8.1) to 3.9 for this.
- Had to update dcicutils (from 8.11.0) to 8.13.0 (for the pyramid update above).
* Minor change to dev_servers.py to facilitate running a local ElasticSearch proxy
to observe traffic (requests/responses) between the portal and ElasticSearch
with a tool like mitmproxy or mitmweb; see the comments in dev_servers.py and
the example below.
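
For example (from the comments in dev_servers.py), setting elasticsearch.server.actual_port
to 9201 and elasticsearch.server to localhost:9200 makes ElasticSearch actually run on port
9201 while the portal talks to it via port 9200, leaving room for the proxy in between::

    mitmweb --mode reverse:http://localhost:9201 -p 9200 --web-port 8081

Browsing to http://localhost:8081 then shows all of the ElasticSearch traffic.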


11.17.0
=======

* Add ``/routes`` endpoint to return all routes and select item views in the application; see the sketch below
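
A minimal sketch of how such an endpoint might enumerate routes (hypothetical; the
actual implementation lives in snovault/routes.py, included via
config.include('snovault.routes'))::

    from pyramid.interfaces import IRoutesMapper
    from pyramid.view import view_config

    # Assumes a matching config.add_route('routes', '/routes') registration.
    @view_config(route_name='routes', request_method='GET', renderer='json')
    def routes_view(request):
        # List every route registered with the application, by name and pattern.
        mapper = request.registry.queryUtility(IRoutesMapper)
        return {'routes': [{'name': route.name, 'pattern': route.pattern}
                           for route in mapper.get_routes()]}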


11.16.0
=======

760 changes: 571 additions & 189 deletions poetry.lock

Large diffs are not rendered by default.

27 changes: 17 additions & 10 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dcicsnovault"
version = "11.16.0.1b0"
version = "11.21.1.0b0"
description = "Storage support for 4DN Data Portals."
authors = ["4DN-DCIC Team <[email protected]>"]
license = "MIT"
@@ -34,27 +34,34 @@ classifiers = [
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: 3.12'
]

[tool.poetry.dependencies]
python = ">=3.8.1,<3.12"
python = ">=3.9,<3.13"
aws_requests_auth = "^0.4.1"
boto3 = ">=1.28.62" # no particular version required, but this speeds up search
botocore = ">=1.31.62" # no particular version required, but this speeds up search
boto3 = "^1.34.136"
botocore = "^1.34.136"
elasticsearch = "7.13.4" # versions >= 7.14.0 lock out AWS ES
elasticsearch_dsl = "^7.4.0"
dcicutils = "^8.11.0"
future = ">=0.15.2,<1"
#python-3.12 elasticsearch = "^7.17.9"
#python-3.12 elasticsearch_dsl = "^7.4.1"
dcicutils = "^8.14.0"
future = "^0.18.3"
html5lib = ">=1.1" # experimental, should be OK now that we're not using moto server
humanfriendly = "^1.44.9"
netaddr = ">=0.8.0,<1"
numpy = "^1.26.4"
passlib = "^1.7.4"
pillow = "^9.5.0"
pmdarima = "^2.0.4"
psutil = "^5.9.0"
psycopg2-binary = "^2.9.1"
PyBrowserID = ">=0.10.0,<1"
pyjwt = "^2.6.0"
pyramid = "1.10.4"
#pyramid = "^2.0.2"
#pyramid = "1.10.4"
pyramid = "1.10.8"
pyramid-multiauth = ">=0.9.0,<1"
pyramid-retry = "^1.0"
pyramid-tm = "^2.5"
@@ -75,7 +82,7 @@ subprocess_middleware = ">=0.3,<1"
# TODO: Investigate whether a major version upgrade is allowable for 'transaction'.
transaction = "^3.0.1"
# TODO: Investigate whether a major version upgrade is allowable for 'venusian'.
venusian = "^1.2.0"
venusian = "^3.1.0"
WebOb = "^1.8.7"
WebTest = "^2.0.35"
WSGIProxy2 = "0.4.2"
Expand All @@ -86,8 +93,8 @@ xlrd = "^1.0.0"
jsonschema = {extras = ["format-nongpl"], version = "^4.19.0"}

[tool.poetry.dev-dependencies]
boto3-stubs = ">=1.28.62" # no particular version required, but this speeds up search
botocore-stubs = ">=1.31.62" # no particular version required, but this speeds up search
boto3-stubs = "^1.34.136"
botocore-stubs = "^1.34.136"
coverage = ">=6.2"
codacy-coverage = ">=1.3.11"
# When we add coverage, this must be loaded manually in GA workflow for coverage because a dependency on 2to3
1 change: 1 addition & 0 deletions snovault/__init__.py
@@ -50,6 +50,7 @@ def includeme(config):
config.include('snovault.resource_views')
config.include('snovault.settings')
config.include('snovault.server_defaults')
config.include('snovault.routes')


def main(global_config, **local_config):
2 changes: 1 addition & 1 deletion snovault/commands/create_mapping_on_deploy.py
@@ -66,7 +66,7 @@ def _run_create_mapping(app, args):
log.info('Overriding deploy_cfg and wiping ES')
deploy_cfg['WIPE_ES'] = True
run_create_mapping(app, check_first=(not deploy_cfg['WIPE_ES']), purge_queue=args.clear_queue,
item_order=loadxl_order)
item_order=loadxl_order())
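# Note: loadxl_order must be called here; the previous code passed the function object itself.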
except Exception as e:
log.error("Exception encountered while gathering deployment information or running create_mapping")
log.error(str(e))
6 changes: 3 additions & 3 deletions snovault/crud_views.py
@@ -3,7 +3,7 @@
UUID,
uuid4,
)

from copy import deepcopy
import transaction
from pyramid.exceptions import HTTPForbidden
from pyramid.settings import asbool
@@ -379,7 +379,7 @@ def get_item_revision_history(request, uuid):
The more edits an item has undergone, the more expensive this
operation is.
"""
revisions = request.registry[STORAGE].revision_history(uuid=uuid)
revisions = deepcopy(request.registry[STORAGE].revision_history(uuid=uuid))
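# Deep-copy so that resolving last_modified below does not modify the stored propsheets in place.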
# Resolve last_modified
# NOTE: last_modified is a server_default present in our applications that
# use snovault. This code is intended to resolve the user email of the last_modified
@@ -393,7 +393,7 @@
if not user:
user = request.embed(modified_by, frame='raw')
user_cache[modified_by] = user
revision['last_modified']['modified_by'] = user.get('email', 'No email specified!')
revision['last_modified']['modified_by'] = user['email']
return revisions


49 changes: 45 additions & 4 deletions snovault/dev_servers.py
@@ -14,6 +14,7 @@
import shutil
import subprocess
import sys
from urllib.parse import urlparse as url_parse, parse_qs as url_parse_query

from dcicutils.misc_utils import PRINT
from pyramid.paster import get_app, get_appsettings
@@ -24,6 +25,7 @@


EPILOG = __doc__
DEFAULT_DATA_DIR = "/tmp/snovault"

logger = logging.getLogger(__name__)

@@ -103,7 +105,7 @@ def main():
parser.add_argument('--clear', action="store_true", help="Clear existing data")
parser.add_argument('--init', action="store_true", help="Init database")
parser.add_argument('--load', action="store_true", help="Load test set")
parser.add_argument('--datadir', default='/tmp/snovault', help="path to datadir")
parser.add_argument('--datadir', default=None, help="path to datadir")
parser.add_argument('--no_ingest', action="store_true", default=False, help="Don't start the ingestion process.")
args = parser.parse_args()

@@ -125,6 +127,29 @@ def run(app_name, config_uri, datadir, clear=False, init=False, load=False, inge
# TODO: This variable seems to not get used? -kmp 25-Jul-2020
config = get_appsettings(config_uri, app_name)

if sqlalchemy_url := config.get("sqlalchemy.url", None):
# New as of 2024-07-30 (dmichaels).
# Handle sqlalchemy.url property defined in development.ini that looks something like this:
# sqlalchemy.url = postgresql://postgres@localhost:5442/postgres?host=/tmp/snovault/pgdata
# This allows us to get the temporary data directory (from the URL host query-string, for both
# Postgres and ElasticSearch, e.g. /tmp/snovault) and the Postgres port (from the URL port),
# so that we can easily change where Postgres is running to support (for example) running
# both smaht-portal and cgap-portal locally simultaneously. This also obviates the need
# in the portal makefiles to parse out the port from this (sqlalchemy.url) property to
# set the SNOVAULT_DB_TEST_PORT environment variable, as was previously done.
sqlalchemy_url_parsed = url_parse(sqlalchemy_url)
sqlalchemy_url_port = sqlalchemy_url_parsed.port
sqlalchemy_url_query = url_parse_query(sqlalchemy_url_parsed.query)
if sqlalchemy_url_host := sqlalchemy_url_query.get("host", [None])[0]:
if sqlalchemy_url_host.endswith("/pgdata"):
sqlalchemy_url_host = sqlalchemy_url_host[:-len("/pgdata")]
if (datadir is None) and sqlalchemy_url_host:
datadir = sqlalchemy_url_host
if (os.environ.get("SNOVAULT_DB_TEST_PORT", None) is None) and sqlalchemy_url_port:
os.environ["SNOVAULT_DB_TEST_PORT"] = str(sqlalchemy_url_port)
if not datadir:
datadir = DEFAULT_DATA_DIR

datadir = os.path.abspath(datadir)
pgdata = os.path.join(datadir, 'pgdata')
esdata = os.path.join(datadir, 'esdata')
@@ -154,14 +179,28 @@ def cleanup_process():
processes = []

# For now - required components
postgres = postgresql_fixture.server_process(pgdata, echo=True)
postgres = postgresql_fixture.server_process(pgdata, echo=True, port=os.environ.get("SNOVAULT_DB_TEST_PORT"))
processes.append(postgres)

es_server_url = config.get('elasticsearch.server', "localhost")

if '127.0.0.1' in es_server_url or 'localhost' in es_server_url:
# Bootup local ES server subprocess. Else assume connecting to remote ES cluster.
elasticsearch = elasticsearch_fixture.server_process(esdata, echo=True)
# Boot up local ES server subprocess; otherwise assume we are connecting to a remote ES cluster.
# The elasticsearch.server.actual_port property is useful (only) for running a localhost ElasticSearch
# proxy in order to observe traffic (requests/responses) between portal and ElasticSearch with a tool like
# mitmweb; e.g. setting elasticsearch.server.actual_port to 9201 and elasticsearch.server to localhost:9200
# will cause ElasticSearch to actually run on port 9201 but will cause portal to talk to it via port 9200,
# and then we can run mitmweb --mode reverse:http://localhost:9201 -p 9200 --web-port 8081 which will
# allow us to browse to http://localhost:8081 locally to observe all of the ElasticSearch traffic.
if (es_port := config.get('elasticsearch.server.actual_port', None)) and es_port.isdigit():
es_port = int(es_port)
elif ((colon := es_server_url.rfind(":")) > 0) and (es_port := es_server_url[colon + 1:]).isdigit():
es_port = int(es_port)
else:
es_port = None
transport_ports = config.get('elasticsearch.server.transport_ports', None)
elasticsearch = elasticsearch_fixture.server_process(esdata, port=es_port, echo=True,
transport_ports=transport_ports)
processes.append(elasticsearch)
elif not config.get('indexer.namespace'):
raise Exception(
@@ -201,6 +240,8 @@ def cleanup_process():

# now clear the queues and queue items for indexing
create_mapping.run(app, check_first=True, strict=True, purge_queue=False)
# To test prod setup:
# create_mapping.reindex_by_type_staggered(app)

PRINT('Started. ^C to exit.')

16 changes: 9 additions & 7 deletions snovault/elasticsearch/create_mapping.py
@@ -33,6 +33,7 @@
# from ..commands.es_index_data import run as run_index_data
from ..schema_utils import combine_schemas
from ..tools import make_indexer_testapp
from ..project_app import app_project
from ..util import (
add_default_embeds, IndexSettings,
NUM_SHARDS, NUM_REPLICAS, SEARCH_MAX, KW_IGNORE_ABOVE, MIN_NGRAM,
@@ -874,11 +875,11 @@ def build_index(app, es, index_name, in_type, mapping, uuids_to_index, dry_run,
start = timer()
coll_uuids = set(get_uuids_for_types(app.registry, types=[in_type]))
end = timer()
log.info('Time to get collection uuids: %s' % str(end-start), cat='fetch time',
duration=str(end-start), collection=in_type)
log.warning('Time to get collection uuids: %s' % str(end-start), cat='fetch time',
duration=str(end-start), collection=in_type)
uuids_to_index[in_type] = coll_uuids
log.info('MAPPING: will queue all %s items in the new index %s for reindexing' %
(len(coll_uuids), in_type), cat='items to queue', count=len(coll_uuids), collection=in_type)
log.warning('MAPPING: will queue all %s items in the new index %s for reindexing' %
(len(coll_uuids), in_type), cat='items to queue', count=len(coll_uuids), collection=in_type)


def check_if_index_exists(es, in_type):
@@ -1383,7 +1384,7 @@ def reindex_by_type_staggered(app):
registry = app.registry
es = registry[ELASTIC_SEARCH]
indexer_queue = registry[INDEXER_QUEUE]
all_types = registry[COLLECTIONS].by_item_type
all_types = app_project().loadxl_order()

log.warning('Running staggered create_mapping command - wiping and reindexing indices sequentially'
' to minimize downtime.')
@@ -1396,10 +1397,11 @@
uuids = {}
build_index(app, es, namespaced_index, i_type, mapping, uuids, False)
mapping_end = timer()
to_index_list = flatten_and_sort_uuids(app.registry, uuids, None)
to_index_list = flatten_and_sort_uuids(app.registry, uuids, None)  # None here is fine since we pre-ordered
indexer_queue.add_uuids(app.registry, to_index_list, strict=True,
target_queue='secondary')
log.warning(f'Queued type {i_type} in {mapping_end - current_start}')
log.warning(f'Queued type {i_type} ({len(to_index_list)} total items) in {mapping_end - current_start}')
log.warning(f'First 10 items: {list(uuid for uuid in to_index_list[0:10])}')
time.sleep(10) # give queue some time to catch up
while not indexer_queue.queue_is_empty():
time.sleep(10) # check every 10 seconds
8 changes: 4 additions & 4 deletions snovault/embed.py
@@ -1,14 +1,14 @@
import logging
from copy import deepcopy
from posixpath import join
from pyramid.compat import (
native_,
unquote_bytes_to_wsgi,
)
from pyramid.httpexceptions import HTTPNotFound, HTTPServerError
import pyramid.request
from .crud_views import collection_add as sno_collection_add
from .interfaces import COLLECTIONS, CONNECTION
from .pyramid_compat import (
native_,
unquote_bytes_to_wsgi,
)
from .resources import Collection
from .schema_utils import validate_request
from dcicutils.misc_utils import check_true
4 changes: 4 additions & 0 deletions snovault/indexing_views.py
@@ -108,6 +108,10 @@ def item_index_data(context, request):
with indexing_timer(indexing_stats, 'upgrade_properties'):
properties = context.upgrade_properties()

# 2024-07-09: Make sure that the uuid gets into the frame=raw view.
if not properties.get('uuid'):
properties['uuid'] = uuid

# ES versions 2 and up don't allow dots in links. Update these to use ~s
new_links = {}
for key, val in context.links(properties).items():
2 changes: 1 addition & 1 deletion snovault/local_roles.py
@@ -1,6 +1,6 @@
from dcicutils.misc_utils import environ_bool, PRINT
from pyramid.authorization import ACLAuthorizationPolicy
from pyramid.compat import is_nonstr_iter
from pyramid.util import is_nonstr_iter
from pyramid.interfaces import IAuthorizationPolicy
from pyramid.location import lineage
from zope.interface import implementer