Merge pull request #76 from IQSS/develop

Update from IQSS develop

lubitchv authored Oct 19, 2020
2 parents 2fb9106 + 3a2f2cc commit bc9e4c8
Showing 69 changed files with 2,284 additions and 1,140 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -34,6 +34,7 @@ oauth-credentials.md

/src/main/webapp/oauth2/newAccount.html
scripts/api/setup-all.sh*
scripts/api/setup-all.*.log

# ctags generated tag file
tags
6 changes: 5 additions & 1 deletion conf/solr/7.7.2/schema_dv_mdb_copies.xml
@@ -133,9 +133,13 @@
<copyField source="studyAssayOtherMeasurmentType" dest="_text_" maxChars="3000"/>
<copyField source="studyAssayOtherOrganism" dest="_text_" maxChars="3000"/>
<copyField source="studyAssayPlatform" dest="_text_" maxChars="3000"/>
<copyField source="studyAssayOtherPlatform" dest="_text_" maxChars="3000"/>
<copyField source="studyAssayTechnologyType" dest="_text_" maxChars="3000"/>
<copyField source="studyAssayOtherTechnologyType" dest="_text_" maxChars="3000"/>
<copyField source="studyDesignType" dest="_text_" maxChars="3000"/>
<copyField source="studyOtherDesignType" dest="_text_" maxChars="3000"/>
<copyField source="studyFactorType" dest="_text_" maxChars="3000"/>
<copyField source="studyOtherFactorType" dest="_text_" maxChars="3000"/>
<copyField source="subject" dest="_text_" maxChars="3000"/>
<copyField source="subtitle" dest="_text_" maxChars="3000"/>
<copyField source="targetSampleActualSize" dest="_text_" maxChars="3000"/>
@@ -154,4 +158,4 @@
<copyField source="universe" dest="_text_" maxChars="3000"/>
<copyField source="weighting" dest="_text_" maxChars="3000"/>
<copyField source="westLongitude" dest="_text_" maxChars="3000"/>
</schema>
6 changes: 5 additions & 1 deletion conf/solr/7.7.2/schema_dv_mdb_fields.xml
@@ -133,9 +133,13 @@
<field name="studyAssayOtherMeasurmentType" type="text_en" multiValued="true" stored="true" indexed="true"/>
<field name="studyAssayOtherOrganism" type="text_en" multiValued="true" stored="true" indexed="true"/>
<field name="studyAssayPlatform" type="text_en" multiValued="true" stored="true" indexed="true"/>
<field name="studyAssayOtherPlatform" type="text_en" multiValued="true" stored="true" indexed="true"/>
<field name="studyAssayTechnologyType" type="text_en" multiValued="true" stored="true" indexed="true"/>
<field name="studyAssayOtherTechnologyType" type="text_en" multiValued="true" stored="true" indexed="true"/>
<field name="studyDesignType" type="text_en" multiValued="true" stored="true" indexed="true"/>
<field name="studyOtherDesignType" type="text_en" multiValued="true" stored="true" indexed="true"/>
<field name="studyFactorType" type="text_en" multiValued="true" stored="true" indexed="true"/>
<field name="studyOtherFactorType" type="text_en" multiValued="true" stored="true" indexed="true"/>
<field name="subject" type="text_en" multiValued="true" stored="true" indexed="true"/>
<field name="subtitle" type="text_en" multiValued="false" stored="true" indexed="true"/>
<field name="targetSampleActualSize" type="text_en" multiValued="false" stored="true" indexed="true"/>
@@ -154,4 +158,4 @@
<field name="universe" type="text_en" multiValued="true" stored="true" indexed="true"/>
<field name="weighting" type="text_en" multiValued="false" stored="true" indexed="true"/>
<field name="westLongitude" type="text_en" multiValued="true" stored="true" indexed="true"/>
</fields>
99 changes: 99 additions & 0 deletions doc/release-notes/5.1-release-notes.md
@@ -0,0 +1,99 @@
# Dataverse 5.1

This release brings new features, enhancements, and bug fixes to Dataverse. Thank you to all of the community members who contributed code, suggestions, bug reports, and other assistance across the project.

## Release Highlights

### Large File Upload for Installations Using AWS S3

The added support for multipart upload through the API and UI (Issue #6763) will allow files larger than 5 GB to be uploaded to Dataverse when an installation is running on AWS S3. Previously, only non-AWS S3 storage configurations would allow uploads larger than 5 GB.

### Dataset-Specific Stores

In previous releases, configuration options were added that allow each dataverse to have a specific store enabled. This release adds even more granularity, with the ability to set a dataset-level store.

## Major Use Cases

Newly-supported use cases in this release include:

- Users can now upload files larger than 5 GB on installations running AWS S3 (Issue #6763, PR #6995)
- Administrators will now be able to specify a store at the dataset level in addition to the Dataverse level (Issue #6872, PR #7272)
- Users will have their dataset's directory structure retained when uploading a dataset with shapefiles (Issue #6873, PR #7279)
- Users will now be able to download zip files through the experimental Zipper service when the set of downloaded files contains duplicate names (Issue [#80](https://github.com/IQSS/dataverse.harvard.edu/issues/80), PR #7276)
- Users will now be able to download zip files with the proper file structure through the experimental Zipper service (Issue #7255, PR #7258)
- Administrators will be able to use new APIs to keep the Solr index and the DB in sync, allowing easier resolution of an issue that would occasionally cause stale search results to not load. (Issue #4225, PR #7211)

## Notes for Dataverse Installation Administrators

### New API for setting a Dataset-level Store

- This release adds a new API for setting a dataset-specific store. Learn more in the Managing Dataverses and Datasets section of the [Admin Guide](http://guides.dataverse.org/en/5.1/admin/dataverses-datasets.html).
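
The new endpoint follows the existing per-dataverse pattern. A minimal sketch, assuming placeholder values for the server URL, API token, dataset database id, and store label (the commands are assembled into strings and echoed rather than executed, so you can review them before running against your installation):

```shell
# Sketch of the dataset-level store API added in 5.1. SERVER, API_TOKEN,
# DATASET_ID, and STORAGE_DRIVER are placeholders; a superuser API token
# is required for the PUT and DELETE calls. Commands are assembled here
# but not executed.
SERVER="http://localhost:8080"
API_TOKEN="your-superuser-api-token"   # placeholder
DATASET_ID=42                          # hypothetical database id of the dataset
STORAGE_DRIVER="s3"                    # label of a store defined in your configuration

# Assign a specific store to the dataset:
set_cmd="curl -H X-Dataverse-key:$API_TOKEN -X PUT -d $STORAGE_DRIVER $SERVER/api/datasets/$DATASET_ID/storageDriver"

# Inspect the store the dataset currently uses:
get_cmd="curl $SERVER/api/datasets/$DATASET_ID/storageDriver"

# Reset the dataset to the default store:
reset_cmd="curl -H X-Dataverse-key:$API_TOKEN -X DELETE $SERVER/api/datasets/$DATASET_ID/storageDriver"

printf '%s\n' "$set_cmd" "$get_cmd" "$reset_cmd"
```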

### Multipart Upload Storage Monitoring, Recommended Use for Multipart Upload

Charges may be incurred for storage reserved for multipart uploads that are not completed or cancelled. Administrators may want to do periodic manual or automated checks for open multipart uploads. Learn more in the Big Data Support section of the [Developers Guide](http://guides.dataverse.org/en/5.1/developer/big-data-support.html).

While multipart uploads can support much larger files, and can have advantages in terms of robust transfer and speed, they are more complex than single part direct uploads. Administrators should consider taking advantage of the options to limit use of multipart uploads to specific users by using multiple stores and configuring access to stores with high file size limits to specific Dataverses (added in 4.20) or Datasets (added in this release).
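
One possible approach to the periodic check, assuming the AWS CLI is configured for the bucket in question (the bucket name below is a placeholder, and the commands are assembled but not executed):

```shell
# Hypothetical periodic check for incomplete multipart uploads. Open
# uploads continue to reserve (and bill for) storage until completed or
# aborted. The bucket name is a placeholder.
BUCKET="my-dataverse-bucket"

# List multipart uploads that were started but never completed or aborted:
list_cmd="aws s3api list-multipart-uploads --bucket $BUCKET"

# Abort a specific one, using the Key and UploadId values from the listing:
abort_cmd="aws s3api abort-multipart-upload --bucket $BUCKET --key <key> --upload-id <id>"

printf '%s\n' "$list_cmd" "$abort_cmd"
```

An S3 lifecycle rule with an `AbortIncompleteMultipartUpload` action can automate this cleanup instead of a scheduled script.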

### New APIs for keeping Solr records in sync

This release adds new APIs to keep the Solr index and the DB in sync, allowing easier resolution of an issue that would occasionally cause search results to not load. Learn more in the Solr section of the [Admin Guide](http://guides.dataverse.org/en/5.1/admin/solr-search-index.html).
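
The two endpoints, as documented in that section (localhost:8080 assumes the default setup; the commands are assembled into strings here rather than executed):

```shell
# Consistency-check endpoints added in 5.1, run on the Dataverse server
# itself. Assembled but not executed in this sketch.

# Report database objects missing from Solr, and Solr documents missing
# from the database:
status_cmd="curl http://localhost:8080/api/admin/index/status"

# Remove Solr documents that have no corresponding database object:
clear_cmd="curl http://localhost:8080/api/admin/index/clear-orphans"

printf '%s\n' "$status_cmd" "$clear_cmd"
```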

### Documentation for Purging the Ingest Queue

At times, it may be necessary to cancel long-running Ingest jobs in the interest of system stability. The Troubleshooting section of the [Admin Guide](http://guides.dataverse.org/en/5.1/admin/) now has specific steps.

### Biomedical Metadata Block Updated

The Life Science Metadata block (biomedical.tsv) was updated. "Other Design Type", "Other Factor Type", "Other Technology Type", and "Other Technology Platform" fields were added. See the "Additional Upgrade Steps" section below if you use this block in your installation.

## Notes for Tool Developers and Integrators

### Spaces in File Names

Dataverse Installations using S3 storage will no longer replace spaces in file names of downloaded files with the + character. If your tool or integration has any special handling around this, you may need to make further adjustments to maintain backwards compatibility while also supporting Dataverse installations on 5.1+.
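
For example, a hypothetical normalization step that tolerates both behaviors (note this is lossy for file names that legitimately contain `+`, which is one reason to key the handling off the Dataverse version instead):

```shell
# Illustrative sketch only: undo the pre-5.1 '+' substitution while
# leaving 5.1+ names (which keep their spaces) unchanged in effect.
old_name="my+data+file.csv"   # served by an S3 installation before 5.1
new_name="my data file.csv"   # served by 5.1+

normalize() {
  # Replace every '+' with a space.
  printf '%s\n' "$1" | tr '+' ' '
}

normalize "$old_name"   # -> my data file.csv
normalize "$new_name"   # -> my data file.csv
```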

## Complete List of Changes

For the complete list of code changes in this release, see the [5.1 Milestone](https://github.com/IQSS/dataverse/milestone/90?closed=1) in Github.

For help with upgrading, installing, or general questions please post to the [Dataverse Google Group](https://groups.google.com/forum/#!forum/dataverse-community) or email [email protected].

## Installation

If this is a new installation, please see our [Installation Guide](http://guides.dataverse.org/en/5.1/installation/)

## Upgrade Instructions

0. These instructions assume that you've already successfully upgraded from Dataverse 4.x to Dataverse 5 following the instructions in the [Dataverse 5 Release Notes](https://github.com/IQSS/dataverse/releases/tag/v5.0).

1. Undeploy the previous version.

        <payara install path>/payara/bin/asadmin list-applications
        <payara install path>/payara/bin/asadmin undeploy dataverse

2. Stop Payara, remove the generated directory, then start Payara again.

        service payara stop
        rm -rf <payara install path>/payara/domains/domain1/generated
        service payara start

3. Deploy this version.

        <payara install path>/payara/bin/asadmin deploy <path>dataverse-5.1.war

4. Restart Payara.

### Additional Upgrade Steps

1. Update the Biomedical Metadata Block (if used), reload Solr, and run ReExportAll.

   `wget https://github.com/IQSS/dataverse/releases/download/v5.1/biomedical.tsv`

   `curl http://localhost:8080/api/admin/datasetfield/load -X POST --data-binary @biomedical.tsv -H "Content-type: text/tab-separated-values"`

- Copy schema_dv_mdb_fields.xml and schema_dv_mdb_copies.xml to the Solr server, for example into the /usr/local/solr/solr-7.7.2/server/solr/collection1/conf/ directory.
- Restart Solr, or tell Solr to reload its configuration:

  `curl "http://localhost:8983/solr/admin/cores?action=RELOAD&core=collection1"`

- Run ReExportAll to update the JSON exports: <http://guides.dataverse.org/en/5.1/admin/metadataexport.html?highlight=export#batch-exports-through-the-api>
59 changes: 59 additions & 0 deletions doc/release-notes/5.1.1-release-notes.md
@@ -0,0 +1,59 @@
# Dataverse 5.1.1

This minor release adds important scaling improvements for installations running on AWS S3. It is recommended that 5.1.1 be used in production instead of 5.1.

## Release Highlights

### Connection Pool Size Configuration Option, Connection Optimizations

Dataverse 5.1 improved the efficiency of making S3 connections through use of an http connection pool. This release adds optimizations around closing streams and channels that may hold S3 http connections open and exhaust the connection pool. In parallel, this release increases the default pool size from 50 to 256 and adds the ability to increase the size of the connection pool, so a larger pool can be configured if needed.

## Major Use Cases

Newly-supported use cases in this release include:

- Administrators of installations using S3 will be able to define the connection pool size, allowing better resource scaling for larger installations (Issue #7309, PR #7313)

## Notes for Dataverse Installation Administrators

### 5.1.1 vs. 5.1 for Production Use

As mentioned above, we encourage 5.1.1 instead of 5.1 for production use.

### New JVM Option for Connection Pool Size

Larger installations may want to increase the number of open S3 connections allowed (default is 256). For example, to set the value to 4096:

`./asadmin create-jvm-options "-Ddataverse.files.<id>.connection-pool-size=4096"`

The JVM Options section of the [Configuration Guide](http://guides.dataverse.org/en/5.1.1/installation/config/) has more information.

## Complete List of Changes

For the complete list of code changes in this release, see the [5.1.1 Milestone](https://github.com/IQSS/dataverse/milestone/91?closed=1) in Github.

For help with upgrading, installing, or general questions please post to the [Dataverse Google Group](https://groups.google.com/forum/#!forum/dataverse-community) or email [email protected].

## Installation

If this is a new installation, please see our [Installation Guide](http://guides.dataverse.org/en/5.1.1/installation/)

## Upgrade Instructions

0. These instructions assume that you've already successfully upgraded to Dataverse 5.1 following the instructions in the [Dataverse 5.1 Release Notes](https://github.com/IQSS/dataverse/releases/tag/v5.1).

1. Undeploy the previous version.

        <payara install path>/payara/bin/asadmin list-applications
        <payara install path>/payara/bin/asadmin undeploy dataverse

2. Stop Payara, remove the generated directory, then start Payara again.

        service payara stop
        rm -rf <payara install path>/payara/domains/domain1/generated
        service payara start

3. Deploy this version.

        <payara install path>/payara/bin/asadmin deploy <path>dataverse-5.1.1.war

4. Restart Payara.
3 changes: 0 additions & 3 deletions doc/release-notes/6763-multipart-uploads.md

This file was deleted.

12 changes: 12 additions & 0 deletions doc/release-notes/7140-google-cloud.md
@@ -0,0 +1,12 @@
## Google Cloud Archiver

Dataverse Bags can now be sent to a Google Cloud bucket, including buckets in the 'Coldline' storage class, which provides less expensive but slower access.

## Use Cases

- As an Administrator I can set up a regular export to Google Cloud so that my users' data is preserved.

## New Settings

- :GoogleCloudProject - the name of the project managing the bucket.
- :GoogleCloudBucket - the name of the bucket to use.
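
Database settings like these are typically set through the standard admin settings API; a sketch with placeholder project and bucket names (the commands are assembled but not executed here):

```shell
# Sketch: setting the two new database settings via the admin settings
# API. "my-gcp-project" and "my-archive-bucket" are placeholders.
project_cmd="curl -X PUT -d my-gcp-project http://localhost:8080/api/admin/settings/:GoogleCloudProject"
bucket_cmd="curl -X PUT -d my-archive-bucket http://localhost:8080/api/admin/settings/:GoogleCloudBucket"

printf '%s\n' "$project_cmd" "$bucket_cmd"
```
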
7 changes: 0 additions & 7 deletions doc/release-notes/7184-spaces-in-filenames.md

This file was deleted.

22 changes: 22 additions & 0 deletions doc/sphinx-guides/source/admin/dataverses-datasets.rst
@@ -59,6 +59,8 @@ The available drivers can be listed with::

  curl -H "X-Dataverse-key: $API_TOKEN" http://$SERVER/api/admin/dataverse/storageDrivers

(Individual datasets can be configured to use specific file stores as well. See the "Datasets" section below.)


Datasets
--------
@@ -130,3 +132,23 @@ Diagnose Constraint Violations Issues in Datasets

To identify invalid data values in specific datasets (if, for example, an attempt to edit a dataset results in a ConstraintViolationException in the server log), or to check all the datasets in the Dataverse for constraint violations, see :ref:`Dataset Validation <dataset-validation-api>` in the :doc:`/api/native-api` section of the User Guide.

Configure a Dataset to store all new files in a specific file store
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Configure a dataset to use a specific file store (this API can only be used by a superuser)::

  curl -H "X-Dataverse-key: $API_TOKEN" -X PUT -d $storageDriverLabel http://$SERVER/api/datasets/$dataset-id/storageDriver

The current driver can be seen using::

  curl http://$SERVER/api/datasets/$dataset-id/storageDriver

It can be reset to the default store as follows (only a superuser can do this)::

  curl -H "X-Dataverse-key: $API_TOKEN" -X DELETE http://$SERVER/api/datasets/$dataset-id/storageDriver

The available drivers can be listed with::

  curl -H "X-Dataverse-key: $API_TOKEN" http://$SERVER/api/admin/dataverse/storageDrivers

14 changes: 13 additions & 1 deletion doc/sphinx-guides/source/admin/solr-search-index.rst
@@ -14,6 +14,18 @@ There are two ways to perform a full reindex of the Dataverse search index. Star
Clear and Reindex
+++++++++++++++++


Index and Database Consistency
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Get a list of all database objects that are missing in Solr, and Solr documents that are missing in the database:

``curl http://localhost:8080/api/admin/index/status``

Remove all Solr documents that are orphaned (i.e., not associated with objects in the database):

``curl http://localhost:8080/api/admin/index/clear-orphans``

Clearing Data from Solr
~~~~~~~~~~~~~~~~~~~~~~~

@@ -81,4 +93,4 @@ If you suspect something isn't indexed properly in solr, you may bypass the Data

``curl "http://localhost:8983/solr/collection1/select?q=dsPersistentId:doi:10.15139/S3/HFV0AO"``

to see the JSON you were hopefully expecting to see passed along to Dataverse.
20 changes: 20 additions & 0 deletions doc/sphinx-guides/source/admin/troubleshooting.rst
@@ -43,6 +43,26 @@ A User Needs Their Account to Be Converted From Institutional (Shibboleth), ORCI

See :ref:`converting-shibboleth-users-to-local` and :ref:`converting-oauth-users-to-local`.

.. _troubleshooting-ingest:

Ingest
------

Long-Running Ingest Jobs Have Exhausted System Resources
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Ingest is both CPU- and memory-intensive, and depending on your system resources and the size and format of tabular data files uploaded, may render Dataverse unresponsive or nearly inoperable. It is possible to cancel these jobs by purging the ingest queue.

``/usr/local/payara5/mq/bin/imqcmd -u admin query dst -t q -n DataverseIngest`` will query the DataverseIngest destination. The password, unless you have changed it, matches the username.

``/usr/local/payara5/mq/bin/imqcmd -u admin purge dst -t q -n DataverseIngest`` will purge the DataverseIngest queue, and prompt for your confirmation.

Finally, list destinations to verify that the purge was successful:

``/usr/local/payara5/mq/bin/imqcmd -u admin list dst``

If you are still running Glassfish, substitute glassfish4 for payara5 above. If you have installed Dataverse in some other location, adjust the above paths accordingly.

.. _troubleshooting-payara:

Payara