diff --git a/.gitignore b/.gitignore index ec135fb3618..4d08cfb2257 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ oauth-credentials.md /src/main/webapp/oauth2/newAccount.html scripts/api/setup-all.sh* +scripts/api/setup-all.*.log # ctags generated tag file tags diff --git a/conf/solr/7.7.2/schema_dv_mdb_copies.xml b/conf/solr/7.7.2/schema_dv_mdb_copies.xml index 0208fdf3910..080cc71ef50 100644 --- a/conf/solr/7.7.2/schema_dv_mdb_copies.xml +++ b/conf/solr/7.7.2/schema_dv_mdb_copies.xml @@ -133,9 +133,13 @@ + + + + @@ -154,4 +158,4 @@ - \ No newline at end of file + diff --git a/conf/solr/7.7.2/schema_dv_mdb_fields.xml b/conf/solr/7.7.2/schema_dv_mdb_fields.xml index 6caa7c6de69..3f844c6183c 100644 --- a/conf/solr/7.7.2/schema_dv_mdb_fields.xml +++ b/conf/solr/7.7.2/schema_dv_mdb_fields.xml @@ -133,9 +133,13 @@ + + + + @@ -154,4 +158,4 @@ - \ No newline at end of file + diff --git a/doc/release-notes/5.1-release-notes.md b/doc/release-notes/5.1-release-notes.md new file mode 100644 index 00000000000..3d106b2df7b --- /dev/null +++ b/doc/release-notes/5.1-release-notes.md @@ -0,0 +1,99 @@ +# Dataverse 5.1 + +This release brings new features, enhancements, and bug fixes to Dataverse. Thank you to all of the community members who contributed code, suggestions, bug reports, and other assistance across the project. + +## Release Highlights + +### Large File Upload for Installations Using AWS S3 + +The added support for multipart upload through the API and UI (Issue #6763) will allow files larger than 5 GB to be uploaded to Dataverse when an installation is running on AWS S3. Previously, only non-AWS S3 storage configurations would allow uploads larger than 5 GB. + +### Dataset-Specific Stores + +In previous releases, configuration options were added that allow each dataverse to have a specific store enabled. This release adds even more granularity, with the ability to set a dataset-level store. + +## Major Use Cases + +Newly-supported use cases in this release include: + +- Users can now upload files larger than 5 GB on installations running AWS S3 (Issue #6763, PR #6995) +- Administrators will now be able to specify a store at the dataset level in addition to the Dataverse level (Issue #6872, PR #7272) +- Users will have their dataset's directory structure retained when uploading a dataset with shapefiles (Issue #6873, PR #7279) +- Users will now be able to download zip files through the experimental Zipper service when the set of downloaded files have duplicate names (Issue [#80](https://github.com/IQSS/dataverse.harvard.edu/issues/80), PR #7276) +- Users will now be able to download zip files with the proper file structure through the experiment Zipper service (Issue #7255, PR #7258) +- Administrators will be able to use new APIs to keep the Solr index and the DB in sync, allowing easier resolution of an issue that would occasionally cause stale search results to not load. (Issue #4225, PR #7211) + +## Notes for Dataverse Installation Administrators + +### New API for setting a Dataset-level Store + +- This release adds a new API for setting a dataset-specific store. Learn more in the Managing Dataverse and Datasets section of the [Admin Guide](http://guides.dataverse.org/en/5.1/admin/solr-search-index.html). + +### Multipart Upload Storage Monitoring, Recommended Use for Multipart Upload + +Charges may be incurred for storage reserved for multipart uploads that are not completed or cancelled. 
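One way to spot such leftover uploads (an illustrative sketch using the AWS CLI rather than anything built into Dataverse; the bucket name is a placeholder) is:

`aws s3api list-multipart-uploads --bucket your-dataverse-bucket`

Stale entries it reports can be removed with `aws s3api abort-multipart-upload`, or cleaned up automatically with an S3 lifecycle rule that aborts incomplete multipart uploads after a chosen number of days.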
Administrators may want to do periodic manual or automated checks for open multipart uploads. Learn more in the Big Data Support section of the [Developers Guide](http://guides.dataverse.org/en/5.1/developer/big-data-support.html). + +While multipart uploads can support much larger files, and can have advantages in terms of robust transfer and speed, they are more complex than single part direct uploads. Administrators should consider taking advantage of the options to limit use of multipart uploads to specific users by using multiple stores and configuring access to stores with high file size limits to specific Dataverses (added in 4.20) or Datasets (added in this release). + +### New APIs for keeping Solr records in sync + +This release adds new APIs to keep the Solr index and the DB in sync, allowing easier resolution of an issue that would occasionally cause search results to not load. Learn more in the Solr section of the [Admin Guide](http://guides.dataverse.org/en/5.1/admin/solr-search-index.html). + +### Documentation for Purging the Ingest Queue + +At times, it may be necessary to cancel long-running Ingest jobs in the interest of system stability. The Troubleshooting section of the [Admin Guide](http://guides.dataverse.org/en/5.1/admin/) now has specific steps. + +### Biomedical Metadata Block Updated + +The Life Science Metadata block (biomedical.tsv) was updated. "Other Design Type", "Other Factor Type", "Other Technology Type", "Other Technology Platform" boxes were added. See the "Additional Upgrade Steps" below if you use this in your installation. + +## Notes for Tool Developers and Integrators + +### Spaces in File Names + +Dataverse Installations using S3 storage will no longer replace spaces in file names of downloaded files with the + character. If your tool or integration has any special handling around this, you may need to make further adjustments to maintain backwards compatibility while also supporting Dataverse installations on 5.1+. + +## Complete List of Changes + +For the complete list of code changes in this release, see the [5.1 Milestone](https://github.com/IQSS/dataverse/milestone/90?closed=1) in Github. + +For help with upgrading, installing, or general questions please post to the [Dataverse Google Group](https://groups.google.com/forum/#!forum/dataverse-community) or email support@dataverse.org. + +## Installation + +If this is a new installation, please see our [Installation Guide](http://guides.dataverse.org/en/5.1/installation/) + +## Upgrade Instructions + +0. These instructions assume that you've already successfully upgraded from Dataverse 4.x to Dataverse 5 following the instructions in the [Dataverse 5 Release Notes](https://github.com/IQSS/dataverse/releases/tag/v5.0). + +1. Undeploy the previous version. + +/payara/bin/asadmin list-applications +/payara/bin/asadmin undeploy dataverse + +2. Stop payara and remove the generated directory, start. + +- service payara stop +- remove the generated directory: rm -rf payara/payara/domains/domain1/generated +- service payara start + +3. Deploy this version. +/payara/bin/asadmin deploy dataverse-5.1.war + +4. Restart payara + +### Additional Upgrade Steps + +1. 
Update Biomedical Metadata Block (if used), Reload Solr, ReExportAll + + `wget https://github.com/IQSS/dataverse/releases/download/v5.1/biomedical.tsv` + `curl http://localhost:8080/api/admin/datasetfield/load -X POST --data-binary @biomedical.tsv -H "Content-type: text/tab-separated-values"` + +- copy schema_dv_mdb_fields.xml and schema_dv_mdb_copies.xml to solr server, for example into /usr/local/solr/solr-7.7.2/server/solr/collection1/conf/ directory +- Restart Solr, or tell Solr to reload its configuration: + + `curl "http://localhost:8983/solr/admin/cores?action=RELOAD&core=collection1"` + +- Run ReExportall to update JSON Exports + diff --git a/doc/release-notes/5.1.1-release-notes.md b/doc/release-notes/5.1.1-release-notes.md new file mode 100644 index 00000000000..f5243aebc8f --- /dev/null +++ b/doc/release-notes/5.1.1-release-notes.md @@ -0,0 +1,59 @@ +# Dataverse 5.1.1 + +This minor release adds important scaling improvements for installations running on AWS S3. It is recommended that 5.1.1 be used in production instead of 5.1. + +## Release Highlights + +### Connection Pool Size Configuration Option, Connection Optimizations + +Dataverse 5.1 improved the efficiency of making S3 connections through use of an http connection pool. This release adds optimizations around closing streams and channels that may hold S3 http connections open and exhaust the connection pool. In parallel, this release increases the default pool size from 50 to 256 and adds the ability to increase the size of the connection pool, so a larger pool can be configured if needed. + +## Major Use Cases + +Newly-supported use cases in this release include: + +- Administrators of installations using S3 will be able to define the connection pool size, allowing better resource scaling for larger installations (Issue #7309, PR #7313) + +## Notes for Dataverse Installation Administrators + +### 5.1.1 vs. 5.1 for Production Use + +As mentioned above, we encourage 5.1.1 instead of 5.1 for production use. + +### New JVM Option for Connection Pool Size + +Larger installations may want to increase the number of open S3 connections allowed (default is 256). For example, to set the value to 4096: + +``./asadmin create-jvm-options "-Ddataverse.files..connection-pool-size=4096"` + +The JVM Options section of the [Configuration Guide](http://guides.dataverse.org/en/5.1.1/installation/config/) has more information. + +## Complete List of Changes + +For the complete list of code changes in this release, see the [5.1.1 Milestone](https://github.com/IQSS/dataverse/milestone/91?closed=1) in Github. + +For help with upgrading, installing, or general questions please post to the [Dataverse Google Group](https://groups.google.com/forum/#!forum/dataverse-community) or email support@dataverse.org. + +## Installation + +If this is a new installation, please see our [Installation Guide](http://guides.dataverse.org/en/5.1.1/installation/) + +## Upgrade Instructions + +0. These instructions assume that you've already successfully upgraded to Dataverse 5.1 following the instructions in the [Dataverse 5.1 Release Notes](https://github.com/IQSS/dataverse/releases/tag/v5.1). + +1. Undeploy the previous version. + +/payara/bin/asadmin list-applications +/payara/bin/asadmin undeploy dataverse + +2. Stop payara and remove the generated directory, start. + +- service payara stop +- remove the generated directory: rm -rf payara/payara/domains/domain1/generated +- service payara start + +3. Deploy this version. 
+/payara/bin/asadmin deploy dataverse-5.1.1.war + +4. Restart payara diff --git a/doc/release-notes/6763-multipart-uploads.md b/doc/release-notes/6763-multipart-uploads.md deleted file mode 100644 index ecec3efd9dc..00000000000 --- a/doc/release-notes/6763-multipart-uploads.md +++ /dev/null @@ -1,3 +0,0 @@ -# Large Data Support (continued) - -Direct S3 uploads now support multi-part uploading of large files (> 1GB by default) via the user interface and the API (which is used in the [Dataverse Uploader](https://github.com/GlobalDataverseCommunityConsortium/dataverse-uploader)). This allows uploads larger than 5 GB when using Amazon AWS S3 stores. \ No newline at end of file diff --git a/doc/release-notes/7140-google-cloud.md b/doc/release-notes/7140-google-cloud.md new file mode 100644 index 00000000000..62aef73acd0 --- /dev/null +++ b/doc/release-notes/7140-google-cloud.md @@ -0,0 +1,12 @@ +## Google Cloud Archiver + +Dataverse Bags can now be sent to a bucket in Google Cloud, including those in the 'Coldline' storage class, which provide less expensive but slower access. + +## Use Cases + +- As an Administrator I can set up a regular export to Google Cloud so that my users' data is preserved. + +## New Settings + +:GoogleCloudProject - the name of the project managing the bucket. +:GoogleCloudBucket - the name of the bucket to use \ No newline at end of file diff --git a/doc/release-notes/7184-spaces-in-filenames.md b/doc/release-notes/7184-spaces-in-filenames.md deleted file mode 100644 index 1a5b41068ce..00000000000 --- a/doc/release-notes/7184-spaces-in-filenames.md +++ /dev/null @@ -1,7 +0,0 @@ -## Notes for Tool Developers and Integrators - -### Filenames - -Dataverse Installations using S3 storage will no longer replace spaces in file names with the + character. If your tool or integration has any special handling around this character change, you can remove it. - -(review this note if this is in the same release as the fix for #7188) \ No newline at end of file diff --git a/doc/sphinx-guides/source/admin/dataverses-datasets.rst b/doc/sphinx-guides/source/admin/dataverses-datasets.rst index 6349088beea..9c122c25abc 100644 --- a/doc/sphinx-guides/source/admin/dataverses-datasets.rst +++ b/doc/sphinx-guides/source/admin/dataverses-datasets.rst @@ -59,6 +59,8 @@ The available drivers can be listed with:: curl -H "X-Dataverse-key: $API_TOKEN" http://$SERVER/api/admin/dataverse/storageDrivers +(Individual datasets can be configured to use specific file stores as well. See the "Datasets" section below.) + Datasets -------- @@ -130,3 +132,23 @@ Diagnose Constraint Violations Issues in Datasets To identify invalid data values in specific datasets (if, for example, an attempt to edit a dataset results in a ConstraintViolationException in the server log), or to check all the datasets in the Dataverse for constraint violations, see :ref:`Dataset Validation ` in the :doc:`/api/native-api` section of the User Guide. 
+Configure a Dataset to store all new files in a specific file store +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Configure a dataset to use a specific file store (this API can only be used by a superuser) :: + + curl -H "X-Dataverse-key: $API_TOKEN" -X PUT -d $storageDriverLabel http://$SERVER/api/datasets/$dataset-id/storageDriver + +The current driver can be seen using:: + + curl http://$SERVER/api/datasets/$dataset-id/storageDriver + +It can be reset to the default store as follows (only a superuser can do this) :: + + curl -H "X-Dataverse-key: $API_TOKEN" -X DELETE http://$SERVER/api/datasets/$dataset-id/storageDriver + +The available drivers can be listed with:: + + curl -H "X-Dataverse-key: $API_TOKEN" http://$SERVER/api/admin/dataverse/storageDrivers + + diff --git a/doc/sphinx-guides/source/admin/solr-search-index.rst b/doc/sphinx-guides/source/admin/solr-search-index.rst index 07e51b4564f..d37b7eedb26 100644 --- a/doc/sphinx-guides/source/admin/solr-search-index.rst +++ b/doc/sphinx-guides/source/admin/solr-search-index.rst @@ -14,6 +14,18 @@ There are two ways to perform a full reindex of the Dataverse search index. Star Clear and Reindex +++++++++++++++++ + +Index and Database Consistency +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Get a list of all database objects that are missing in Solr, and Solr documents that are missing in the database: + +``curl http://localhost:8080/api/admin/index/status`` + +Remove all Solr documents that are orphaned (ie not associated with objects in the database): + +``curl http://localhost:8080/api/admin/index/clear-orphans`` + Clearing Data from Solr ~~~~~~~~~~~~~~~~~~~~~~~ @@ -81,4 +93,4 @@ If you suspect something isn't indexed properly in solr, you may bypass the Data ``curl "http://localhost:8983/solr/collection1/select?q=dsPersistentId:doi:10.15139/S3/HFV0AO"`` -to see the JSON you were hopefully expecting to see passed along to Dataverse. \ No newline at end of file +to see the JSON you were hopefully expecting to see passed along to Dataverse. diff --git a/doc/sphinx-guides/source/admin/troubleshooting.rst b/doc/sphinx-guides/source/admin/troubleshooting.rst index 0c752924b30..ec24de245b6 100644 --- a/doc/sphinx-guides/source/admin/troubleshooting.rst +++ b/doc/sphinx-guides/source/admin/troubleshooting.rst @@ -43,6 +43,26 @@ A User Needs Their Account to Be Converted From Institutional (Shibboleth), ORCI See :ref:`converting-shibboleth-users-to-local` and :ref:`converting-oauth-users-to-local`. +.. _troubleshooting-ingest: + +Ingest +------ + +Long-Running Ingest Jobs Have Exhausted System Resources +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Ingest is both CPU- and memory-intensive, and depending on your system resources and the size and format of tabular data files uploaded, may render Dataverse unresponsive or nearly inoperable. It is possible to cancel these jobs by purging the ingest queue. + +``/usr/local/payara5/mq/bin/imqcmd -u admin query dst -t q -n DataverseIngest`` will query the DataverseIngest destination. The password, unless you have changed it, matches the username. + +``/usr/local/payara5/mq/bin/imqcmd -u admin purge dst -t q -n DataverseIngest`` will purge the DataverseIngest queue, and prompt for your confirmation. + +Finally, list destinations to verify that the purge was successful:: + +``/usr/local/payara5/mq/bin/imqcmd -u admin list dst`` + +If you are still running Glassfish, substitute glassfish4 for payara5 above. 
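For example, the equivalent queue listing on a Glassfish installation would be ``/usr/local/glassfish4/mq/bin/imqcmd -u admin list dst``.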
If you have installed Dataverse in some other location, adjust the above paths accordingly. + .. _troubleshooting-payara: Payara diff --git a/doc/sphinx-guides/source/api/native-api.rst b/doc/sphinx-guides/source/api/native-api.rst index 03e8f5f3f39..3240ee9ebe0 100644 --- a/doc/sphinx-guides/source/api/native-api.rst +++ b/doc/sphinx-guides/source/api/native-api.rst @@ -989,16 +989,16 @@ Note that the dataset citation date field type must be a date field. export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx export SERVER_URL=https://demo.dataverse.org - export ID=24 - export DATASET_FIELD_TYPE_NAME=:dateOfDeposit + export PERSISTENT_IDENTIFIER=doi:10.5072/FK2/J8SJZB + export DATASET_FIELD_TYPE_NAME=dateOfDeposit - curl -H "X-Dataverse-key: $API_TOKEN" -X PUT $SERVER_URL/api/datasets/$ID/citationdate --data "$DATASET_FIELD_TYPE_NAME" + curl -H "X-Dataverse-key: $API_TOKEN" -X PUT $SERVER_URL/api/datasets/:persistentId/citationdate?persistentId=$PERSISTENT_IDENTIFIER --data "$DATASET_FIELD_TYPE_NAME" The fully expanded example above (without environment variables) looks like this: .. code-block:: bash - curl -H "X-Dataverse-key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X PUT https://demo.dataverse.org/api/datasets/24/citationdate --data ":dateOfDeposit" + curl -H "X-Dataverse-key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X PUT https://demo.dataverse.org/api/datasets/:persistentId/citationdate?persistentId=doi:10.5072/FK2/J8SJZB --data "dateOfDeposit" Revert Citation Date Field Type to Default for Dataset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1009,15 +1009,15 @@ Restores the default citation date field type, ``:publicationDate``, for a given export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx export SERVER_URL=https://demo.dataverse.org - export ID=24 + export PERSISTENT_IDENTIFIER=doi:10.5072/FK2/J8SJZB - curl -H "X-Dataverse-key: $API_TOKEN" -X DELETE $SERVER_URL/api/datasets/$ID/citationdate + curl -H "X-Dataverse-key: $API_TOKEN" -X DELETE $SERVER_URL/api/datasets/:persistentId/citationdate?persistentId=$PERSISTENT_IDENTIFIER The fully expanded example above (without environment variables) looks like this: .. code-block:: bash - curl -H "X-Dataverse-key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X DELETE https://demo.dataverse.org/api/datasets/24/citationdate + curl -H "X-Dataverse-key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X DELETE https://demo.dataverse.org/api/datasets/:persistentId/citationdate?persistentId=doi:10.5072/FK2/J8SJZB .. _list-roles-on-a-dataset-api: @@ -1654,6 +1654,11 @@ The fully expanded example above (without environment variables) looks like this Calling the destroy endpoint is permanent and irreversible. It will remove the dataset and its datafiles, then re-index the parent dataverse in Solr. This endpoint requires the API token of a superuser. +Configure a Dataset to Use a Specific File Store +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``/api/datasets/$dataset-id/storageDriver`` can be used to check, configure or reset the designated file store (storage driver) for a dataset. Please see the :doc:`/admin/dataverses-datasets` section of the guide for more information on this API. + Files ----- diff --git a/doc/sphinx-guides/source/conf.py b/doc/sphinx-guides/source/conf.py index 17c68d38468..2cba6ba5491 100755 --- a/doc/sphinx-guides/source/conf.py +++ b/doc/sphinx-guides/source/conf.py @@ -65,9 +65,9 @@ # built documents. # # The short X.Y version. -version = '5.0' +version = '5.1.1' # The full version, including alpha/beta/rc tags. 
-release = '5.0' +release = '5.1.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 34da299528f..de8fbad3687 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -516,6 +516,9 @@ By default, your store will use the [default] profile in you .aws configuration ``./asadmin create-jvm-options "-Ddataverse.files..profile="`` +Larger installations may want to increase the number of open S3 connections allowed (default is 256): For example, + +``./asadmin create-jvm-options "-Ddataverse.files..connection-pool-size=4096"`` In case you would like to configure Dataverse to use a custom S3 service instead of Amazon S3 services, please add the options for the custom URL and region as documented below. Please read above if your desired combination has @@ -541,6 +544,7 @@ dataverse.files..custom-endpoint-region Only used when dataverse.files..path-style-access ``true``/``false`` Use path style buckets instead of subdomains. Optional. ``false`` dataverse.files..payload-signing ``true``/``false`` Enable payload signing. Optional ``false`` dataverse.files..chunked-encoding ``true``/``false`` Disable chunked encoding. Optional ``true`` +dataverse.files..connection-pool-size The maximum number of open connections to the S3 server ``256`` =========================================== ================== ========================================================================= ============= Reported Working S3-Compatible Storage @@ -772,6 +776,8 @@ For Google Analytics, the example script at :download:`analytics-code.html `_ serialized `OAI-ORE `_ map file, which is also available as a metadata export format in the Dataverse web interface. -At present, the DPNSubmitToArchiveCommand and LocalSubmitToArchiveCommand are the only implementations extending the AbstractSubmitToArchiveCommand and using the configurable mechanisms discussed below. +At present, the DPNSubmitToArchiveCommand, LocalSubmitToArchiveCommand, and GoogleCloudSubmitToArchive are the only implementations extending the AbstractSubmitToArchiveCommand and using the configurable mechanisms discussed below. .. _Duracloud Configuration: @@ -827,10 +833,41 @@ ArchiverClassName - the fully qualified class to be used for archiving. For exam \:ArchiverSettings - the archiver class can access required settings including existing Dataverse settings and dynamically defined ones specific to the class. This setting is a comma-separated list of those settings. For example\: -``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":BagItLocalPathâ€`` +``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":BagItLocalPath"`` :BagItLocalPath is the file path that you've set in :ArchiverSettings. +.. 
_Google Cloud Configuration: + +Google Cloud Configuration +++++++++++++++++++++++++++ + +The Google Cloud Archiver can send Dataverse Bags to a bucket in Google's cloud, including those in the 'Coldline' storage class (cheaper, with slower access) + +``curl http://localhost:8080/api/admin/settings/:ArchiverClassName -X PUT -d "edu.harvard.iq.dataverse.engine.command.impl.GoogleCloudSubmitToArchiveCommand"`` + +``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":GoogleCloudBucket, :GoogleCloudProject"`` + +The Google Cloud Archiver defines two custom settings, both are required. The credentials for your account, in the form of a json key file, must also be obtained and stored locally (see below): + +In order to use the Google Cloud Archiver, you must have a Google account. You will need to create a project and bucket within that account and provide those values in the settings: + +\:GoogleCloudBucket - the name of the bucket to use. For example: + +``curl http://localhost:8080/api/admin/settings/:GoogleCloudBucket -X PUT -d "qdr-archive"`` + +\:GoogleCloudProject - the name of the project managing the bucket. For example: + +``curl http://localhost:8080/api/admin/settings/:GoogleCloudProject -X PUT -d "qdr-project"`` + +The Google Cloud Archiver also requires a key file that must be renamed to 'googlecloudkey.json' and placed in the directory identified by your 'dataverse.files.directory' jvm option. This file can be created in the Google Cloud Console. (One method: Navigate to your Project 'Settings'/'Service Accounts', create an account, give this account the 'Cloud Storage'/'Storage Admin' role, and once it's created, use the 'Actions' menu to 'Create Key', selecting the 'JSON' format option. Use this as the 'googlecloudkey.json' file.) + +For example: + +``cp /usr/local/payara5/glassfish/domains/domain1/files/googlecloudkey.json`` + +.. _Archiving API Call: + API Call ++++++++ @@ -2120,3 +2157,40 @@ To enable redirects to the zipper installed on the same server as the main Datav To enable redirects to the zipper on a different server: ``curl -X PUT -d 'https://zipper.example.edu/cgi-bin/zipdownload' http://localhost:8080/api/admin/settings/:CustomZipDownloadServiceUrl`` + +:ArchiverClassName +++++++++++++++++++ + +Dataverse can export archival "Bag" files to an extensible set of storage systems (see :ref:`BagIt Export` above for details about this and for further explanation of the other archiving related settings below). +This setting specifies which storage system to use by identifying the particular Java class that should be run. Current options include DuraCloudSubmitToArchiveCommand, LocalSubmitToArchiveCommand, and GoogleCloudSubmitToArchiveCommand. + +``curl -X PUT -d 'LocalSubmitToArchiveCommand' http://localhost:8080/api/admin/settings/:ArchiverClassName`` + +:ArchiverSettings ++++++++++++++++++ + +Each Archiver class may have its own custom settings. Along with setting which Archiver class to use, one must use this setting to identify which setting values should be sent to it when it is invoked. The value should be a comma-separated list of setting names. +For example, the LocalSubmitToArchiveCommand only uses the :BagItLocalPath setting. 
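(:BagItLocalPath itself is defined through the same settings API, e.g. ``curl -X PUT -d '/usr/local/payara5/bags' http://localhost:8080/api/admin/settings/:BagItLocalPath``, where the path shown is purely illustrative.)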
To allow the class to use that setting, this setting must set as: + +``curl -X PUT -d ':BagItLocalPath' http://localhost:8080/api/admin/settings/:ArchiverSettings`` + +:DuraCloudHost +++++++++++++++ +:DuraCloudPort +++++++++++++++ +:DuraCloudContext ++++++++++++++++++ + +These three settings define the host, port, and context used by the DuraCloudSubmitToArchiveCommand. :DuraCloudHost is required. The other settings have default values as noted in the :ref:`Duracloud Configuration` section above. + +:BagItLocalPath ++++++++++++++++ + +This is the local file system path to be used with the LocalSubmitToArchiveCommand class. It is recommended to use an absolute path. See the :ref:`Local Path Configuration` section above. + +:GoogleCloudBucket +++++++++++++++++++ +:GoogleCloudProject ++++++++++++++++++++ + +These are the bucket and project names to be used with the GoogleCloudSubmitToArchiveCommand class. Further information is in the :ref:`Google Cloud Configuration` section above. diff --git a/doc/sphinx-guides/source/versions.rst b/doc/sphinx-guides/source/versions.rst index a9f389fde54..0874d04f8ed 100755 --- a/doc/sphinx-guides/source/versions.rst +++ b/doc/sphinx-guides/source/versions.rst @@ -6,8 +6,10 @@ Dataverse Documentation Versions This list provides a way to refer to the documentation for previous versions of Dataverse. In order to learn more about the updates delivered from one version to another, visit the `Releases `__ page in our GitHub repo. -- 5.0 +- 5.1.1 +- `5.1 `__ +- `5.0 `__ - `4.20 `__ - `4.19 `__ - `4.18.1 `__ diff --git a/pom.xml b/pom.xml index 6c9fa99dbc9..792941ed548 100644 --- a/pom.xml +++ b/pom.xml @@ -7,7 +7,7 @@ --> edu.harvard.iq dataverse - 5.0 + 5.1.1 war dataverse @@ -57,7 +57,7 @@ - @@ -127,6 +127,13 @@ httpclient ${httpcomponents.client.version} + + com.google.cloud + google-cloud-bom + 0.115.0-alpha + pom + import + org.testcontainers testcontainers-bom @@ -137,7 +144,7 @@ @@ -440,11 +447,6 @@ slf4j-log4j12 1.6.1 - - axis - axis - 1.4 - io.searchbox jest @@ -573,7 +575,7 @@ org.apache.tika tika-parsers - 1.22 + 1.24.1 @@ -581,6 +583,11 @@ opennlp-tools 1.9.1 + + com.google.cloud + google-cloud-storage + 1.97.0 + diff --git a/scripts/api/data/dataset-create-new-all-default-fields.json b/scripts/api/data/dataset-create-new-all-default-fields.json index ba801b9bae8..7a82cd4bb75 100644 --- a/scripts/api/data/dataset-create-new-all-default-fields.json +++ b/scripts/api/data/dataset-create-new-all-default-fields.json @@ -181,7 +181,7 @@ "typeName": "dsDescriptionValue", "multiple": false, "typeClass": "primitive", - "value": "DescriptionText 1" + "value": "DescriptionText1" }, "dsDescriptionDate": { "typeName": "dsDescriptionDate", @@ -264,6 +264,53 @@ } ] }, + { + "typeName": "topicClassification", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "topicClassValue": { + "typeName": "topicClassValue", + "multiple": false, + "typeClass": "primitive", + "value": "Topic Classification Term1" + }, + "topicClassVocab": { + "typeName": "topicClassVocab", + "multiple": false, + "typeClass": "primitive", + "value": "Topic Classification Vocab1" + }, + "topicClassVocabURI": { + "typeName": "topicClassVocabURI", + "multiple": false, + "typeClass": "primitive", + "value": "https://TopicClassificationURL1.com" + } + }, + { + "topicClassValue": { + "typeName": "topicClassValue", + "multiple": false, + "typeClass": "primitive", + "value": "Topic Classification Term2" + }, + "topicClassVocab": { + "typeName": "topicClassVocab", + "multiple": false, + 
"typeClass": "primitive", + "value": "Topic Classification Vocab2" + }, + "topicClassVocabURI": { + "typeName": "topicClassVocabURI", + "multiple": false, + "typeClass": "primitive", + "value": "https://TopicClassificationURL2.com" + } + } + ] + }, { "typeName": "publication", "multiple": true, @@ -329,6 +376,15 @@ "typeClass": "primitive", "value": "Notes1" }, + { + "typeName": "language", + "multiple": true, + "typeClass": "controlledVocabulary", + "value": [ + "Abkhaz", + "Afar" + ] + }, { "typeName": "producer", "multiple": true, diff --git a/scripts/api/data/metadatablocks/biomedical.tsv b/scripts/api/data/metadatablocks/biomedical.tsv index f45c5849845..28d59130c34 100644 --- a/scripts/api/data/metadatablocks/biomedical.tsv +++ b/scripts/api/data/metadatablocks/biomedical.tsv @@ -1,295 +1,299 @@ -#metadataBlock name dataverseAlias displayName - biomedical Life Sciences Metadata -#datasetField name title description watermark fieldType displayOrder displayFormat advancedSearchField allowControlledVocabulary allowmultiples facetable displayoncreate required parent metadatablock_id - studyDesignType Design Type Design types that are based on the overall experimental design. text 0 TRUE TRUE TRUE TRUE FALSE FALSE biomedical - studyFactorType Factor Type Factors used in the Dataset. text 1 TRUE TRUE TRUE TRUE FALSE FALSE biomedical - studyAssayOrganism Organism The taxonomic name of the organism used in the Dataset or from which the starting biological material derives. text 2 TRUE TRUE TRUE TRUE FALSE FALSE biomedical - studyAssayOtherOrganism Other Organism If Other was selected in Organism, list any other organisms that were used in this Dataset. Terms from the NCBI Taxonomy are recommended. text 3 TRUE FALSE TRUE TRUE FALSE FALSE biomedical - studyAssayMeasurementType Measurement Type A term to qualify the endpoint, or what is being measured (e.g. gene expression profiling; protein identification). text 4 TRUE TRUE TRUE TRUE FALSE FALSE biomedical - studyAssayOtherMeasurmentType Other Measurement Type If Other was selected in Measurement Type, list any other measurement types that were used. Terms from NCBO Bioportal are recommended. text 5 TRUE FALSE TRUE TRUE FALSE FALSE biomedical - studyAssayTechnologyType Technology Type A term to identify the technology used to perform the measurement (e.g. DNA microarray; mass spectrometry). text 6 TRUE TRUE TRUE TRUE FALSE FALSE biomedical - studyAssayPlatform Technology Platform The manufacturer and name of the technology platform used in the assay (e.g. Bruker AVANCE). text 7 TRUE TRUE TRUE TRUE FALSE FALSE biomedical - studyAssayCellType Cell Type The name of the cell line from which the source or sample derives. 
text 8 TRUE TRUE TRUE TRUE FALSE FALSE biomedical -#controlledVocabulary DatasetField Value identifier displayOrder - studyDesignType Case Control EFO_0001427 0 - studyDesignType Cross Sectional EFO_0001428 1 - studyDesignType Cohort Study OCRE100078 2 - studyDesignType Nested Case Control Design NCI_C48202 3 - studyDesignType Not Specified OTHER_DESIGN 4 - studyDesignType Parallel Group Design OBI_0500006 5 - studyDesignType Perturbation Design OBI_0001033 6 - studyDesignType Randomized Controlled Trial MESH_D016449 7 - studyDesignType Technological Design TECH_DESIGN 8 - studyFactorType Age EFO_0000246 0 - studyFactorType Biomarkers BIOMARKERS 1 - studyFactorType Cell Surface Markers CELL_SURFACE_M 2 - studyFactorType Cell Type/Cell Line EFO_0000324;EFO_0000322 3 - studyFactorType Developmental Stage EFO_0000399 4 - studyFactorType Disease State OBI_0001293 5 - studyFactorType Drug Susceptibility IDO_0000469 6 - studyFactorType Extract Molecule FBcv_0010001 7 - studyFactorType Genetic Characteristics OBI_0001404 8 - studyFactorType Immunoprecipitation Antibody OBI_0000690 9 - studyFactorType Organism OBI_0100026 10 - studyFactorType Other OTHER_FACTOR 11 - studyFactorType Passages PASSAGES_FACTOR 12 - studyFactorType Platform OBI_0000050 13 - studyFactorType Sex EFO_0000695 14 - studyFactorType Strain EFO_0005135 15 - studyFactorType Time Point EFO_0000724 16 - studyFactorType Tissue Type BTO_0001384 17 - studyFactorType Treatment Compound EFO_0000369 18 - studyFactorType Treatment Type EFO_0000727 19 - studyAssayMeasurementType cell counting ERO_0001899 0 - studyAssayMeasurementType cell sorting CHMO_0001085 1 - studyAssayMeasurementType clinical chemistry analysis OBI_0000520 2 - studyAssayMeasurementType copy number variation profiling OBI_0000537 3 - studyAssayMeasurementType DNA methylation profiling OBI_0000634 4 - studyAssayMeasurementType DNA methylation profiling (Bisulfite-Seq) OBI_0000748 5 - studyAssayMeasurementType DNA methylation profiling (MeDIP-Seq) _OBI_0000634 6 - studyAssayMeasurementType drug susceptibility _IDO_0000469 7 - studyAssayMeasurementType environmental gene survey ENV_GENE_SURVEY 8 - studyAssayMeasurementType genome sequencing ERO_0001183 9 - studyAssayMeasurementType hematology OBI_0000630 10 - studyAssayMeasurementType histology OBI_0600020 11 - studyAssayMeasurementType Histone Modification (ChIP-Seq) OBI_0002017 12 - studyAssayMeasurementType loss of heterozygosity profiling SO_0001786 13 - studyAssayMeasurementType metabolite profiling OBI_0000366 14 - studyAssayMeasurementType metagenome sequencing METAGENOME_SEQ 15 - studyAssayMeasurementType protein expression profiling OBI_0000615 16 - studyAssayMeasurementType protein identification ERO_0000346 17 - studyAssayMeasurementType protein-DNA binding site identification PROTEIN_DNA_BINDING 18 - studyAssayMeasurementType protein-protein interaction detection OBI_0000288 19 - studyAssayMeasurementType protein-RNA binding (RIP-Seq) PROTEIN_RNA_BINDING 20 - studyAssayMeasurementType SNP analysis OBI_0000435 21 - studyAssayMeasurementType targeted sequencing TARGETED_SEQ 22 - studyAssayMeasurementType transcription factor binding (ChIP-Seq) OBI_0002018 23 - studyAssayMeasurementType transcription factor binding site identification OBI_0000291 24 - studyAssayMeasurementType transcription profiling OBI_0000424 25 - studyAssayMeasurementType transcription profiling EFO_0001032 26 - studyAssayMeasurementType transcription profiling (Microarray) TRANSCRIPTION_PROF 27 - studyAssayMeasurementType transcription 
profiling (RNA-Seq) OBI_0001271 28 - studyAssayMeasurementType TRAP translational profiling TRAP_TRANS_PROF 29 - studyAssayMeasurementType Other OTHER_MEASUREMENT 30 - studyAssayOrganism Arabidopsis thaliana NCBITaxon_3702 0 - studyAssayOrganism Bos taurus NCBITaxon_9913 1 - studyAssayOrganism Caenorhabditis elegans NCBITaxon_6239 2 - studyAssayOrganism Chlamydomonas reinhardtii NCBITaxon_3055 3 - studyAssayOrganism Danio rerio (zebrafish) NCBITaxon_7955 4 - studyAssayOrganism Dictyostelium discoideum NCBITaxon_44689 5 - studyAssayOrganism Drosophila melanogaster NCBITaxon_7227 6 - studyAssayOrganism Escherichia coli NCBITaxon_562 7 - studyAssayOrganism Hepatitis C virus NCBITaxon_11103 8 - studyAssayOrganism Homo sapiens NCBITaxon_9606 9 - studyAssayOrganism Mus musculus NCBITaxon_10090 10 - studyAssayOrganism Mycobacterium africanum NCBITaxon_33894 11 - studyAssayOrganism Mycobacterium canetti NCBITaxon_78331 12 - studyAssayOrganism Mycobacterium tuberculosis NCBITaxon_1773 13 - studyAssayOrganism Mycoplasma pneumoniae NCBITaxon_2104 14 - studyAssayOrganism Oryza sativa NCBITaxon_4530 15 - studyAssayOrganism Plasmodium falciparum NCBITaxon_5833 16 - studyAssayOrganism Pneumocystis carinii NCBITaxon_4754 17 - studyAssayOrganism Rattus norvegicus NCBITaxon_10116 18 - studyAssayOrganism Saccharomyces cerevisiae (brewer's yeast) NCBITaxon_4932 19 - studyAssayOrganism Schizosaccharomyces pombe NCBITaxon_4896 20 - studyAssayOrganism Takifugu rubripes NCBITaxon_31033 21 - studyAssayOrganism Xenopus laevis NCBITaxon_8355 22 - studyAssayOrganism Zea mays NCBITaxon_4577 23 - studyAssayOrganism Other OTHER_TAXONOMY 24 - studyAssayTechnologyType culture based drug susceptibility testing, single concentration CULTURE_DRUG_TEST_SINGLE 0 - studyAssayTechnologyType culture based drug susceptibility testing, two concentrations CULTURE_DRUG_TEST_TWO 1 - studyAssayTechnologyType culture based drug susceptibility testing, three or more concentrations (minimium inhibitory concentration measurement) CULTURE_DRUG_TEST_THREE 2 - studyAssayTechnologyType DNA microarray OBI_0400148 3 - studyAssayTechnologyType flow cytometry OBI_0000916 4 - studyAssayTechnologyType gel electrophoresis OBI_0600053 5 - studyAssayTechnologyType mass spectrometry OBI_0000470 6 - studyAssayTechnologyType NMR spectroscopy OBI_0000623 7 - studyAssayTechnologyType nucleotide sequencing OBI_0000626 8 - studyAssayTechnologyType protein microarray OBI_0400149 9 - studyAssayTechnologyType real time PCR OBI_0000893 10 - studyAssayTechnologyType no technology required NO_TECHNOLOGY 11 - studyAssayTechnologyType Other OTHER_TECHNOLOGY 12 - studyAssayPlatform 210-MS GC Ion Trap (Varian) 210_MS_GC 0 - studyAssayPlatform 220-MS GC Ion Trap (Varian) 220_MS_GC 1 - studyAssayPlatform 225-MS GC Ion Trap (Varian) 225_MS_GC 2 - studyAssayPlatform 240-MS GC Ion Trap (Varian) 240_MS_GC 3 - studyAssayPlatform 300-MS quadrupole GC/MS (Varian) 300_MS_GCMS 4 - studyAssayPlatform 320-MS LC/MS (Varian) 320_MS_LCMS 5 - studyAssayPlatform 325-MS LC/MS (Varian) 325_MS_LCMS 6 - studyAssayPlatform 320-MS GC/MS (Varian) 500_MS_GCMS 7 - studyAssayPlatform 500-MS LC/MS (Varian) 500_MS_LCMS 8 - studyAssayPlatform 800D (Jeol) 800D 9 - studyAssayPlatform 910-MS TQ-FT (Varian) 910_MS_TQFT 10 - studyAssayPlatform 920-MS TQ-FT (Varian) 920_MS_TQFT 11 - studyAssayPlatform 3100 Mass Detector (Waters) 3100_MASS_D 12 - studyAssayPlatform 6110 Quadrupole LC/MS (Agilent) 6110_QUAD_LCMS 13 - studyAssayPlatform 6120 Quadrupole LC/MS (Agilent) 6120_QUAD_LCMS 14 - studyAssayPlatform 
6130 Quadrupole LC/MS (Agilent) 6130_QUAD_LCMS 15 - studyAssayPlatform 6140 Quadrupole LC/MS (Agilent) 6140_QUAD_LCMS 16 - studyAssayPlatform 6310 Ion Trap LC/MS (Agilent) 6310_ION_LCMS 17 - studyAssayPlatform 6320 Ion Trap LC/MS (Agilent) 6320_ION_LCMS 18 - studyAssayPlatform 6330 Ion Trap LC/MS (Agilent) 6330_ION_LCMS 19 - studyAssayPlatform 6340 Ion Trap LC/MS (Agilent) 6340_ION_LCMS 20 - studyAssayPlatform 6410 Triple Quadrupole LC/MS (Agilent) 6410_TRIPLE_LCMS 21 - studyAssayPlatform 6430 Triple Quadrupole LC/MS (Agilent) 6430_TRIPLE_LCMS 22 - studyAssayPlatform 6460 Triple Quadrupole LC/MS (Agilent) 6460_TRIPLE_LCMS 23 - studyAssayPlatform 6490 Triple Quadrupole LC/MS (Agilent) 6490_TRIPLE_LCMS 24 - studyAssayPlatform 6530 Q-TOF LC/MS (Agilent) 6530_Q_TOF_LCMS 25 - studyAssayPlatform 6540 Q-TOF LC/MS (Agilent) 6540_Q_TOF_LCMS 26 - studyAssayPlatform 6210 TOF LC/MS (Agilent) 6210_Q_TOF_LCMS 27 - studyAssayPlatform 6220 TOF LC/MS (Agilent) 6220_Q_TOF_LCMS 28 - studyAssayPlatform 6230 TOF LC/MS (Agilent) 6230_Q_TOF_LCMS 29 - studyAssayPlatform 7000B Triple Quadrupole GC/MS (Agilent) 700B_TRIPLE_GCMS 30 - studyAssayPlatform AccuTO DART (Jeol) ACCUTO_DART 31 - studyAssayPlatform AccuTOF GC (Jeol) ACCUTOF_GC 32 - studyAssayPlatform AccuTOF LC (Jeol) ACCUTOF_LC 33 - studyAssayPlatform ACQUITY SQD (Waters) ACQUITY_SQD 34 - studyAssayPlatform ACQUITY TQD (Waters) ACQUITY_TQD 35 - studyAssayPlatform Agilent AGILENT 36 - studyAssayPlatform Agilent 5975E GC/MSD (Agilent) AGILENT_ 5975E_GCMSD 37 - studyAssayPlatform Agilent 5975T LTM GC/MSD (Agilent) AGILENT_5975T_LTM_GCMSD 38 - studyAssayPlatform 5975C Series GC/MSD (Agilent) 5975C_GCMSD 39 - studyAssayPlatform Affymetrix AFFYMETRIX 40 - studyAssayPlatform amaZon ETD ESI Ion Trap (Bruker) AMAZON_ETD_ESI 41 - studyAssayPlatform amaZon X ESI Ion Trap (Bruker) AMAZON_X_ESI 42 - studyAssayPlatform apex-ultra hybrid Qq-FTMS (Bruker) APEX_ULTRA_QQ_FTMS 43 - studyAssayPlatform API 2000 (AB Sciex) API_2000 44 - studyAssayPlatform API 3200 (AB Sciex) API_3200 45 - studyAssayPlatform API 3200 QTRAP (AB Sciex) API_3200_QTRAP 46 - studyAssayPlatform API 4000 (AB Sciex) API_4000 47 - studyAssayPlatform API 4000 QTRAP (AB Sciex) API_4000_QTRAP 48 - studyAssayPlatform API 5000 (AB Sciex) API_5000 49 - studyAssayPlatform API 5500 (AB Sciex) API_5500 50 - studyAssayPlatform API 5500 QTRAP (AB Sciex) API_5500_QTRAP 51 - studyAssayPlatform Applied Biosystems Group (ABI) APPLIED_BIOSYSTEMS 52 - studyAssayPlatform AQI Biosciences AQI_BIOSCIENCES 53 - studyAssayPlatform Atmospheric Pressure GC (Waters) ATMOS_GC 54 - studyAssayPlatform autoflex III MALDI-TOF MS (Bruker) AUTOFLEX_III_MALDI_TOF_MS 55 - studyAssayPlatform autoflex speed(Bruker) AUTOFLEX_SPEED 56 - studyAssayPlatform AutoSpec Premier (Waters) AUTOSPEC_PREMIER 57 - studyAssayPlatform AXIMA Mega TOF (Shimadzu) AXIMA_MEGA_TOF 58 - studyAssayPlatform AXIMA Performance MALDI TOF/TOF (Shimadzu) AXIMA_PERF_MALDI_TOF 59 - studyAssayPlatform A-10 Analyzer (Apogee) A_10_ANALYZER 60 - studyAssayPlatform A-40-MiniFCM (Apogee) A_40_MINIFCM 61 - studyAssayPlatform Bactiflow (Chemunex SA) BACTIFLOW 62 - studyAssayPlatform Base4innovation BASE4INNOVATION 63 - studyAssayPlatform BD BACTEC MGIT 320 BD_BACTEC_MGIT_320 64 - studyAssayPlatform BD BACTEC MGIT 960 BD_BACTEC_MGIT_960 65 - studyAssayPlatform BD Radiometric BACTEC 460TB BD_RADIO_BACTEC_460TB 66 - studyAssayPlatform BioNanomatrix BIONANOMATRIX 67 - studyAssayPlatform Cell Lab Quanta SC (Becman Coulter) CELL_LAB_QUANTA_SC 68 - studyAssayPlatform Clarus 560 D GC/MS 
(PerkinElmer) CLARUS_560_D_GCMS 69 - studyAssayPlatform Clarus 560 S GC/MS (PerkinElmer) CLARUS_560_S_GCMS 70 - studyAssayPlatform Clarus 600 GC/MS (PerkinElmer) CLARUS_600_GCMS 71 - studyAssayPlatform Complete Genomics COMPLETE_GENOMICS 72 - studyAssayPlatform Cyan (Dako Cytomation) CYAN 73 - studyAssayPlatform CyFlow ML (Partec) CYFLOW_ML 74 - studyAssayPlatform Cyow SL (Partec) CYFLOW_SL 75 - studyAssayPlatform CyFlow SL3 (Partec) CYFLOW_SL3 76 - studyAssayPlatform CytoBuoy (Cyto Buoy Inc) CYTOBUOY 77 - studyAssayPlatform CytoSence (Cyto Buoy Inc) CYTOSENCE 78 - studyAssayPlatform CytoSub (Cyto Buoy Inc) CYTOSUB 79 - studyAssayPlatform Danaher DANAHER 80 - studyAssayPlatform DFS (Thermo Scientific) DFS 81 - studyAssayPlatform Exactive(Thermo Scientific) EXACTIVE 82 - studyAssayPlatform FACS Canto (Becton Dickinson) FACS_CANTO 83 - studyAssayPlatform FACS Canto2 (Becton Dickinson) FACS_CANTO2 84 - studyAssayPlatform FACS Scan (Becton Dickinson) FACS_SCAN 85 - studyAssayPlatform FC 500 (Becman Coulter) FC_500 86 - studyAssayPlatform GCmate II GC/MS (Jeol) GCMATE_II 87 - studyAssayPlatform GCMS-QP2010 Plus (Shimadzu) GCMS_QP2010_PLUS 88 - studyAssayPlatform GCMS-QP2010S Plus (Shimadzu) GCMS_QP2010S_PLUS 89 - studyAssayPlatform GCT Premier (Waters) GCT_PREMIER 90 - studyAssayPlatform GENEQ GENEQ 91 - studyAssayPlatform Genome Corp. GENOME_CORP 92 - studyAssayPlatform GenoVoxx GENOVOXX 93 - studyAssayPlatform GnuBio GNUBIO 94 - studyAssayPlatform Guava EasyCyte Mini (Millipore) GUAVA_EASYCYTE_MINI 95 - studyAssayPlatform Guava EasyCyte Plus (Millipore) GUAVA_EASYCYTE_PLUS 96 - studyAssayPlatform Guava Personal Cell Analysis (Millipore) GUAVA_PERSONAL_CELL 97 - studyAssayPlatform Guava Personal Cell Analysis-96 (Millipore) GUAVA_PERSONAL_CELL_96 98 - studyAssayPlatform Helicos BioSciences HELICOS_BIO 99 - studyAssayPlatform Illumina ILLUMINA 100 - studyAssayPlatform Indirect proportion method on LJ medium INDIRECT_LJ_MEDIUM 101 - studyAssayPlatform Indirect proportion method on Middlebrook Agar 7H9 INDIRECT_AGAR_7H9 102 - studyAssayPlatform Indirect proportion method on Middlebrook Agar 7H10 INDIRECT_AGAR_7H10 103 - studyAssayPlatform Indirect proportion method on Middlebrook Agar 7H11 INDIRECT_AGAR_7H11 104 - studyAssayPlatform inFlux Analyzer (Cytopeia) INFLUX_ANALYZER 105 - studyAssayPlatform Intelligent Bio-Systems INTELLIGENT_BIOSYSTEMS 106 - studyAssayPlatform ITQ 700 (Thermo Scientific) ITQ_700 107 - studyAssayPlatform ITQ 900 (Thermo Scientific) ITQ_900 108 - studyAssayPlatform ITQ 1100 (Thermo Scientific) ITQ_1100 109 - studyAssayPlatform JMS-53000 SpiralTOF (Jeol) JMS_53000_SPIRAL 110 - studyAssayPlatform LaserGen LASERGEN 111 - studyAssayPlatform LCMS-2020 (Shimadzu) LCMS_2020 112 - studyAssayPlatform LCMS-2010EV (Shimadzu) LCMS_2010EV 113 - studyAssayPlatform LCMS-IT-TOF (Shimadzu) LCMS_IT_TOF 114 - studyAssayPlatform Li-Cor LI_COR 115 - studyAssayPlatform Life Tech LIFE_TECH 116 - studyAssayPlatform LightSpeed Genomics LIGHTSPEED_GENOMICS 117 - studyAssayPlatform LCT Premier XE (Waters) LCT_PREMIER_XE 118 - studyAssayPlatform LCQ Deca XP MAX (Thermo Scientific) LCQ_DECA_XP_MAX 119 - studyAssayPlatform LCQ Fleet (Thermo Scientific) LCQ_FLEET 120 - studyAssayPlatform LXQ (Thermo Scientific) LXQ_THERMO 121 - studyAssayPlatform LTQ Classic (Thermo Scientific) LTQ_CLASSIC 122 - studyAssayPlatform LTQ XL (Thermo Scientific) LTQ_XL 123 - studyAssayPlatform LTQ Velos (Thermo Scientific) LTQ_VELOS 124 - studyAssayPlatform LTQ Orbitrap Classic (Thermo Scientific) LTQ_ORBITRAP_CLASSIC 125 - 
studyAssayPlatform LTQ Orbitrap XL (Thermo Scientific) LTQ_ORBITRAP_XL 126 - studyAssayPlatform LTQ Orbitrap Discovery (Thermo Scientific) LTQ_ORBITRAP_DISCOVERY 127 - studyAssayPlatform LTQ Orbitrap Velos (Thermo Scientific) LTQ_ORBITRAP_VELOS 128 - studyAssayPlatform Luminex 100 (Luminex) LUMINEX_100 129 - studyAssayPlatform Luminex 200 (Luminex) LUMINEX_200 130 - studyAssayPlatform MACS Quant (Miltenyi) MACS_QUANT 131 - studyAssayPlatform MALDI SYNAPT G2 HDMS (Waters) MALDI_SYNAPT_G2_HDMS 132 - studyAssayPlatform MALDI SYNAPT G2 MS (Waters) MALDI_SYNAPT_G2_MS 133 - studyAssayPlatform MALDI SYNAPT HDMS (Waters) MALDI_SYNAPT_HDMS 134 - studyAssayPlatform MALDI SYNAPT MS (Waters) MALDI_SYNAPT_MS 135 - studyAssayPlatform MALDI micro MX (Waters) MALDI_MICROMX 136 - studyAssayPlatform maXis (Bruker) MAXIS 137 - studyAssayPlatform maXis G4 (Bruker) MAXISG4 138 - studyAssayPlatform microflex LT MALDI-TOF MS (Bruker) MICROFLEX_LT_MALDI_TOF_MS 139 - studyAssayPlatform microflex LRF MALDI-TOF MS (Bruker) MICROFLEX_LRF_MALDI_TOF_MS 140 - studyAssayPlatform microflex III MALDI-TOF MS (Bruker) MICROFLEX_III_TOF_MS 141 - studyAssayPlatform micrOTOF II ESI TOF (Bruker) MICROTOF_II_ESI_TOF 142 - studyAssayPlatform micrOTOF-Q II ESI-Qq-TOF (Bruker) MICROTOF_Q_II_ESI_QQ_TOF 143 - studyAssayPlatform microplate Alamar Blue (resazurin) colorimetric method MICROPLATE_ALAMAR_BLUE_COLORIMETRIC 144 - studyAssayPlatform Mstation (Jeol) MSTATION 145 - studyAssayPlatform MSQ Plus (Thermo Scientific) MSQ_PLUS 146 - studyAssayPlatform NABsys NABSYS 147 - studyAssayPlatform Nanophotonics Biosciences NANOPHOTONICS_BIOSCIENCES 148 - studyAssayPlatform Network Biosystems NETWORK_BIOSYSTEMS 149 - studyAssayPlatform Nimblegen NIMBLEGEN 150 - studyAssayPlatform Oxford Nanopore Technologies OXFORD_NANOPORE_TECHNOLOGIES 151 - studyAssayPlatform Pacific Biosciences PACIFIC_BIOSCIENCES 152 - studyAssayPlatform Population Genetics Technologies POPULATION_GENETICS_TECHNOLOGIES 153 - studyAssayPlatform Q1000GC UltraQuad (Jeol) Q1000GC_ULTRAQUAD 154 - studyAssayPlatform Quattro micro API (Waters) QUATTRO_MICRO_API 155 - studyAssayPlatform Quattro micro GC (Waters) QUATTRO_MICRO_GC 156 - studyAssayPlatform Quattro Premier XE (Waters) QUATTRO_PREMIER_XE 157 - studyAssayPlatform QSTAR (AB Sciex) QSTAR 158 - studyAssayPlatform Reveo REVEO 159 - studyAssayPlatform Roche ROCHE 160 - studyAssayPlatform Seirad SEIRAD 161 - studyAssayPlatform solariX hybrid Qq-FTMS (Bruker) SOLARIX_HYBRID_QQ_FTMS 162 - studyAssayPlatform Somacount (Bently Instruments) SOMACOUNT 163 - studyAssayPlatform SomaScope (Bently Instruments) SOMASCOPE 164 - studyAssayPlatform SYNAPT G2 HDMS (Waters) SYNAPT_G2_HDMS 165 - studyAssayPlatform SYNAPT G2 MS (Waters) SYNAPT_G2_MS 166 - studyAssayPlatform SYNAPT HDMS (Waters) SYNAPT_HDMS 167 - studyAssayPlatform SYNAPT MS (Waters) SYNAPT_MS 168 - studyAssayPlatform TripleTOF 5600 (AB Sciex) TRIPLETOF_5600 169 - studyAssayPlatform TSQ Quantum Ultra (Thermo Scientific) TSQ_QUANTUM_ULTRA 170 - studyAssayPlatform TSQ Quantum Access (Thermo Scientific) TSQ_QUANTUM_ACCESS 171 - studyAssayPlatform TSQ Quantum Access MAX (Thermo Scientific) TSQ_QUANTUM_ACCESS_MAX 172 - studyAssayPlatform TSQ Quantum Discovery MAX (Thermo Scientific) TSQ_QUANTUM_DISCOVERY_MAX 173 - studyAssayPlatform TSQ Quantum GC (Thermo Scientific) TSQ_QUANTUM_GC 174 - studyAssayPlatform TSQ Quantum XLS (Thermo Scientific) TSQ_QUANTUM_XLS 175 - studyAssayPlatform TSQ Vantage (Thermo Scientific) TSQ_VANTAGE 176 - studyAssayPlatform ultrafleXtreme MALDI-TOF MS 
(Bruker) ULTRAFLEXTREME_MALDI_TOF_MS 177 - studyAssayPlatform VisiGen Biotechnologies VISIGEN_BIO 178 - studyAssayPlatform Xevo G2 QTOF (Waters) XEVO_G2_QTOF 179 - studyAssayPlatform Xevo QTof MS (Waters) XEVO_QTOF_MS 180 - studyAssayPlatform Xevo TQ MS (Waters) XEVO_TQ_MS 181 - studyAssayPlatform Xevo TQ-S (Waters) XEVO_TQ_S 182 +#metadataBlock name dataverseAlias displayName + biomedical Life Sciences Metadata +#datasetField name title description watermark fieldType displayOrder displayFormat advancedSearchField allowControlledVocabulary allowmultiples facetable displayoncreate required parent metadatablock_id + studyDesignType Design Type Design types that are based on the overall experimental design. text 0 TRUE TRUE TRUE TRUE FALSE FALSE biomedical + studyOtherDesignType Other Design Type If Other was selected in Design Type, list any other design types that were used in this Dataset. text 1 TRUE FALSE TRUE TRUE FALSE FALSE biomedical + studyFactorType Factor Type Factors used in the Dataset. text 2 TRUE TRUE TRUE TRUE FALSE FALSE biomedical + studyOtherFactorType Other Factor Type If Other was selected in Factor Type, list any other factor types that were used in this Dataset. text 3 TRUE FALSE TRUE TRUE FALSE FALSE biomedical + studyAssayOrganism Organism The taxonomic name of the organism used in the Dataset or from which the starting biological material derives. text 4 TRUE TRUE TRUE TRUE FALSE FALSE biomedical + studyAssayOtherOrganism Other Organism If Other was selected in Organism, list any other organisms that were used in this Dataset. Terms from the NCBI Taxonomy are recommended. text 5 TRUE FALSE TRUE TRUE FALSE FALSE biomedical + studyAssayMeasurementType Measurement Type A term to qualify the endpoint, or what is being measured (e.g. gene expression profiling; protein identification). text 6 TRUE TRUE TRUE TRUE FALSE FALSE biomedical + studyAssayOtherMeasurmentType Other Measurement Type If Other was selected in Measurement Type, list any other measurement types that were used. Terms from NCBO Bioportal are recommended. text 7 TRUE FALSE TRUE TRUE FALSE FALSE biomedical + studyAssayTechnologyType Technology Type A term to identify the technology used to perform the measurement (e.g. DNA microarray; mass spectrometry). text 8 TRUE TRUE TRUE TRUE FALSE FALSE biomedical + studyAssayOtherTechnologyType Other Technology Type If Other was selected in Technology Type, list any other technology types that were used in this Dataset. text 9 TRUE FALSE TRUE TRUE FALSE FALSE biomedical + studyAssayPlatform Technology Platform The manufacturer and name of the technology platform used in the assay (e.g. Bruker AVANCE). text 10 TRUE TRUE TRUE TRUE FALSE FALSE biomedical + studyAssayOtherPlatform Other Technology Platform If Other was selected in Technology Platform, list any other technology platforms that were used in this Dataset. text 11 TRUE FALSE TRUE TRUE FALSE FALSE biomedical + studyAssayCellType Cell Type The name of the cell line from which the source or sample derives. 
text 12 TRUE TRUE TRUE TRUE FALSE FALSE biomedical +#controlledVocabulary DatasetField Value identifier displayOrder + studyDesignType Case Control EFO_0001427 0 + studyDesignType Cross Sectional EFO_0001428 1 + studyDesignType Cohort Study OCRE100078 2 + studyDesignType Nested Case Control Design NCI_C48202 3 + studyDesignType Not Specified NOT_SPECIFIED 4 + studyDesignType Parallel Group Design OBI_0500006 5 + studyDesignType Perturbation Design OBI_0001033 6 + studyDesignType Randomized Controlled Trial MESH_D016449 7 + studyDesignType Technological Design TECH_DESIGN 8 + studyDesignType Other OTHER_DESIGN 9 + studyFactorType Age EFO_0000246 0 + studyFactorType Biomarkers BIOMARKERS 1 + studyFactorType Cell Surface Markers CELL_SURFACE_M 2 + studyFactorType Cell Type/Cell Line EFO_0000324;EFO_0000322 3 + studyFactorType Developmental Stage EFO_0000399 4 + studyFactorType Disease State OBI_0001293 5 + studyFactorType Drug Susceptibility IDO_0000469 6 + studyFactorType Extract Molecule FBcv_0010001 7 + studyFactorType Genetic Characteristics OBI_0001404 8 + studyFactorType Immunoprecipitation Antibody OBI_0000690 9 + studyFactorType Organism OBI_0100026 10 + studyFactorType Passages PASSAGES_FACTOR 11 + studyFactorType Platform OBI_0000050 12 + studyFactorType Sex EFO_0000695 13 + studyFactorType Strain EFO_0005135 14 + studyFactorType Time Point EFO_0000724 15 + studyFactorType Tissue Type BTO_0001384 16 + studyFactorType Treatment Compound EFO_0000369 17 + studyFactorType Treatment Type EFO_0000727 18 + studyFactorType Other OTHER_FACTOR 19 + studyAssayMeasurementType cell sorting CHMO_0001085 1 + studyAssayMeasurementType clinical chemistry analysis OBI_0000520 2 + studyAssayMeasurementType copy number variation profiling OBI_0000537 3 + studyAssayMeasurementType DNA methylation profiling OBI_0000634 4 + studyAssayMeasurementType DNA methylation profiling (Bisulfite-Seq) OBI_0000748 5 + studyAssayMeasurementType DNA methylation profiling (MeDIP-Seq) _OBI_0000634 6 + studyAssayMeasurementType drug susceptibility _IDO_0000469 7 + studyAssayMeasurementType environmental gene survey ENV_GENE_SURVEY 8 + studyAssayMeasurementType genome sequencing ERO_0001183 9 + studyAssayMeasurementType hematology OBI_0000630 10 + studyAssayMeasurementType histology OBI_0600020 11 + studyAssayMeasurementType Histone Modification (ChIP-Seq) OBI_0002017 12 + studyAssayMeasurementType loss of heterozygosity profiling SO_0001786 13 + studyAssayMeasurementType metabolite profiling OBI_0000366 14 + studyAssayMeasurementType metagenome sequencing METAGENOME_SEQ 15 + studyAssayMeasurementType protein expression profiling OBI_0000615 16 + studyAssayMeasurementType protein identification ERO_0000346 17 + studyAssayMeasurementType protein-DNA binding site identification PROTEIN_DNA_BINDING 18 + studyAssayMeasurementType protein-protein interaction detection OBI_0000288 19 + studyAssayMeasurementType protein-RNA binding (RIP-Seq) PROTEIN_RNA_BINDING 20 + studyAssayMeasurementType SNP analysis OBI_0000435 21 + studyAssayMeasurementType targeted sequencing TARGETED_SEQ 22 + studyAssayMeasurementType transcription factor binding (ChIP-Seq) OBI_0002018 23 + studyAssayMeasurementType transcription factor binding site identification OBI_0000291 24 + studyAssayMeasurementType transcription profiling OBI_0000424 25 + studyAssayMeasurementType transcription profiling EFO_0001032 26 + studyAssayMeasurementType transcription profiling (Microarray) TRANSCRIPTION_PROF 27 + studyAssayMeasurementType transcription profiling 
(RNA-Seq) OBI_0001271 28 + studyAssayMeasurementType TRAP translational profiling TRAP_TRANS_PROF 29 + studyAssayMeasurementType Other OTHER_MEASUREMENT 30 + studyAssayOrganism Arabidopsis thaliana NCBITaxon_3702 0 + studyAssayOrganism Bos taurus NCBITaxon_9913 1 + studyAssayOrganism Caenorhabditis elegans NCBITaxon_6239 2 + studyAssayOrganism Chlamydomonas reinhardtii NCBITaxon_3055 3 + studyAssayOrganism Danio rerio (zebrafish) NCBITaxon_7955 4 + studyAssayOrganism Dictyostelium discoideum NCBITaxon_44689 5 + studyAssayOrganism Drosophila melanogaster NCBITaxon_7227 6 + studyAssayOrganism Escherichia coli NCBITaxon_562 7 + studyAssayOrganism Hepatitis C virus NCBITaxon_11103 8 + studyAssayOrganism Homo sapiens NCBITaxon_9606 9 + studyAssayOrganism Mus musculus NCBITaxon_10090 10 + studyAssayOrganism Mycobacterium africanum NCBITaxon_33894 11 + studyAssayOrganism Mycobacterium canetti NCBITaxon_78331 12 + studyAssayOrganism Mycobacterium tuberculosis NCBITaxon_1773 13 + studyAssayOrganism Mycoplasma pneumoniae NCBITaxon_2104 14 + studyAssayOrganism Oryza sativa NCBITaxon_4530 15 + studyAssayOrganism Plasmodium falciparum NCBITaxon_5833 16 + studyAssayOrganism Pneumocystis carinii NCBITaxon_4754 17 + studyAssayOrganism Rattus norvegicus NCBITaxon_10116 18 + studyAssayOrganism Saccharomyces cerevisiae (brewer's yeast) NCBITaxon_4932 19 + studyAssayOrganism Schizosaccharomyces pombe NCBITaxon_4896 20 + studyAssayOrganism Takifugu rubripes NCBITaxon_31033 21 + studyAssayOrganism Xenopus laevis NCBITaxon_8355 22 + studyAssayOrganism Zea mays NCBITaxon_4577 23 + studyAssayOrganism Other OTHER_TAXONOMY 24 + studyAssayTechnologyType culture based drug susceptibility testing, single concentration CULTURE_DRUG_TEST_SINGLE 0 + studyAssayTechnologyType culture based drug susceptibility testing, two concentrations CULTURE_DRUG_TEST_TWO 1 + studyAssayTechnologyType culture based drug susceptibility testing, three or more concentrations (minimium inhibitory concentration measurement) CULTURE_DRUG_TEST_THREE 2 + studyAssayTechnologyType DNA microarray OBI_0400148 3 + studyAssayTechnologyType flow cytometry OBI_0000916 4 + studyAssayTechnologyType gel electrophoresis OBI_0600053 5 + studyAssayTechnologyType mass spectrometry OBI_0000470 6 + studyAssayTechnologyType NMR spectroscopy OBI_0000623 7 + studyAssayTechnologyType nucleotide sequencing OBI_0000626 8 + studyAssayTechnologyType protein microarray OBI_0400149 9 + studyAssayTechnologyType real time PCR OBI_0000893 10 + studyAssayTechnologyType no technology required NO_TECHNOLOGY 11 + studyAssayTechnologyType Other OTHER_TECHNOLOGY 12 + studyAssayPlatform 210-MS GC Ion Trap (Varian) 210_MS_GC 0 + studyAssayPlatform 220-MS GC Ion Trap (Varian) 220_MS_GC 1 + studyAssayPlatform 225-MS GC Ion Trap (Varian) 225_MS_GC 2 + studyAssayPlatform 240-MS GC Ion Trap (Varian) 240_MS_GC 3 + studyAssayPlatform 300-MS quadrupole GC/MS (Varian) 300_MS_GCMS 4 + studyAssayPlatform 320-MS LC/MS (Varian) 320_MS_LCMS 5 + studyAssayPlatform 325-MS LC/MS (Varian) 325_MS_LCMS 6 + studyAssayPlatform 320-MS GC/MS (Varian) 500_MS_GCMS 7 + studyAssayPlatform 500-MS LC/MS (Varian) 500_MS_LCMS 8 + studyAssayPlatform 800D (Jeol) 800D 9 + studyAssayPlatform 910-MS TQ-FT (Varian) 910_MS_TQFT 10 + studyAssayPlatform 920-MS TQ-FT (Varian) 920_MS_TQFT 11 + studyAssayPlatform 3100 Mass Detector (Waters) 3100_MASS_D 12 + studyAssayPlatform 6110 Quadrupole LC/MS (Agilent) 6110_QUAD_LCMS 13 + studyAssayPlatform 6120 Quadrupole LC/MS (Agilent) 6120_QUAD_LCMS 14 + studyAssayPlatform 6130 
Quadrupole LC/MS (Agilent) 6130_QUAD_LCMS 15 + studyAssayPlatform 6140 Quadrupole LC/MS (Agilent) 6140_QUAD_LCMS 16 + studyAssayPlatform 6310 Ion Trap LC/MS (Agilent) 6310_ION_LCMS 17 + studyAssayPlatform 6320 Ion Trap LC/MS (Agilent) 6320_ION_LCMS 18 + studyAssayPlatform 6330 Ion Trap LC/MS (Agilent) 6330_ION_LCMS 19 + studyAssayPlatform 6340 Ion Trap LC/MS (Agilent) 6340_ION_LCMS 20 + studyAssayPlatform 6410 Triple Quadrupole LC/MS (Agilent) 6410_TRIPLE_LCMS 21 + studyAssayPlatform 6430 Triple Quadrupole LC/MS (Agilent) 6430_TRIPLE_LCMS 22 + studyAssayPlatform 6460 Triple Quadrupole LC/MS (Agilent) 6460_TRIPLE_LCMS 23 + studyAssayPlatform 6490 Triple Quadrupole LC/MS (Agilent) 6490_TRIPLE_LCMS 24 + studyAssayPlatform 6530 Q-TOF LC/MS (Agilent) 6530_Q_TOF_LCMS 25 + studyAssayPlatform 6540 Q-TOF LC/MS (Agilent) 6540_Q_TOF_LCMS 26 + studyAssayPlatform 6210 TOF LC/MS (Agilent) 6210_Q_TOF_LCMS 27 + studyAssayPlatform 6220 TOF LC/MS (Agilent) 6220_Q_TOF_LCMS 28 + studyAssayPlatform 6230 TOF LC/MS (Agilent) 6230_Q_TOF_LCMS 29 + studyAssayPlatform 7000B Triple Quadrupole GC/MS (Agilent) 700B_TRIPLE_GCMS 30 + studyAssayPlatform AccuTO DART (Jeol) ACCUTO_DART 31 + studyAssayPlatform AccuTOF GC (Jeol) ACCUTOF_GC 32 + studyAssayPlatform AccuTOF LC (Jeol) ACCUTOF_LC 33 + studyAssayPlatform ACQUITY SQD (Waters) ACQUITY_SQD 34 + studyAssayPlatform ACQUITY TQD (Waters) ACQUITY_TQD 35 + studyAssayPlatform Agilent AGILENT 36 + studyAssayPlatform Agilent 5975E GC/MSD (Agilent) AGILENT_ 5975E_GCMSD 37 + studyAssayPlatform Agilent 5975T LTM GC/MSD (Agilent) AGILENT_5975T_LTM_GCMSD 38 + studyAssayPlatform 5975C Series GC/MSD (Agilent) 5975C_GCMSD 39 + studyAssayPlatform Affymetrix AFFYMETRIX 40 + studyAssayPlatform amaZon ETD ESI Ion Trap (Bruker) AMAZON_ETD_ESI 41 + studyAssayPlatform amaZon X ESI Ion Trap (Bruker) AMAZON_X_ESI 42 + studyAssayPlatform apex-ultra hybrid Qq-FTMS (Bruker) APEX_ULTRA_QQ_FTMS 43 + studyAssayPlatform API 2000 (AB Sciex) API_2000 44 + studyAssayPlatform API 3200 (AB Sciex) API_3200 45 + studyAssayPlatform API 3200 QTRAP (AB Sciex) API_3200_QTRAP 46 + studyAssayPlatform API 4000 (AB Sciex) API_4000 47 + studyAssayPlatform API 4000 QTRAP (AB Sciex) API_4000_QTRAP 48 + studyAssayPlatform API 5000 (AB Sciex) API_5000 49 + studyAssayPlatform API 5500 (AB Sciex) API_5500 50 + studyAssayPlatform API 5500 QTRAP (AB Sciex) API_5500_QTRAP 51 + studyAssayPlatform Applied Biosystems Group (ABI) APPLIED_BIOSYSTEMS 52 + studyAssayPlatform AQI Biosciences AQI_BIOSCIENCES 53 + studyAssayPlatform Atmospheric Pressure GC (Waters) ATMOS_GC 54 + studyAssayPlatform autoflex III MALDI-TOF MS (Bruker) AUTOFLEX_III_MALDI_TOF_MS 55 + studyAssayPlatform autoflex speed(Bruker) AUTOFLEX_SPEED 56 + studyAssayPlatform AutoSpec Premier (Waters) AUTOSPEC_PREMIER 57 + studyAssayPlatform AXIMA Mega TOF (Shimadzu) AXIMA_MEGA_TOF 58 + studyAssayPlatform AXIMA Performance MALDI TOF/TOF (Shimadzu) AXIMA_PERF_MALDI_TOF 59 + studyAssayPlatform A-10 Analyzer (Apogee) A_10_ANALYZER 60 + studyAssayPlatform A-40-MiniFCM (Apogee) A_40_MINIFCM 61 + studyAssayPlatform Bactiflow (Chemunex SA) BACTIFLOW 62 + studyAssayPlatform Base4innovation BASE4INNOVATION 63 + studyAssayPlatform BD BACTEC MGIT 320 BD_BACTEC_MGIT_320 64 + studyAssayPlatform BD BACTEC MGIT 960 BD_BACTEC_MGIT_960 65 + studyAssayPlatform BD Radiometric BACTEC 460TB BD_RADIO_BACTEC_460TB 66 + studyAssayPlatform BioNanomatrix BIONANOMATRIX 67 + studyAssayPlatform Cell Lab Quanta SC (Becman Coulter) CELL_LAB_QUANTA_SC 68 + studyAssayPlatform Clarus 560 D GC/MS 
(PerkinElmer) CLARUS_560_D_GCMS 69 + studyAssayPlatform Clarus 560 S GC/MS (PerkinElmer) CLARUS_560_S_GCMS 70 + studyAssayPlatform Clarus 600 GC/MS (PerkinElmer) CLARUS_600_GCMS 71 + studyAssayPlatform Complete Genomics COMPLETE_GENOMICS 72 + studyAssayPlatform Cyan (Dako Cytomation) CYAN 73 + studyAssayPlatform CyFlow ML (Partec) CYFLOW_ML 74 + studyAssayPlatform Cyow SL (Partec) CYFLOW_SL 75 + studyAssayPlatform CyFlow SL3 (Partec) CYFLOW_SL3 76 + studyAssayPlatform CytoBuoy (Cyto Buoy Inc) CYTOBUOY 77 + studyAssayPlatform CytoSence (Cyto Buoy Inc) CYTOSENCE 78 + studyAssayPlatform CytoSub (Cyto Buoy Inc) CYTOSUB 79 + studyAssayPlatform Danaher DANAHER 80 + studyAssayPlatform DFS (Thermo Scientific) DFS 81 + studyAssayPlatform Exactive(Thermo Scientific) EXACTIVE 82 + studyAssayPlatform FACS Canto (Becton Dickinson) FACS_CANTO 83 + studyAssayPlatform FACS Canto2 (Becton Dickinson) FACS_CANTO2 84 + studyAssayPlatform FACS Scan (Becton Dickinson) FACS_SCAN 85 + studyAssayPlatform FC 500 (Becman Coulter) FC_500 86 + studyAssayPlatform GCmate II GC/MS (Jeol) GCMATE_II 87 + studyAssayPlatform GCMS-QP2010 Plus (Shimadzu) GCMS_QP2010_PLUS 88 + studyAssayPlatform GCMS-QP2010S Plus (Shimadzu) GCMS_QP2010S_PLUS 89 + studyAssayPlatform GCT Premier (Waters) GCT_PREMIER 90 + studyAssayPlatform GENEQ GENEQ 91 + studyAssayPlatform Genome Corp. GENOME_CORP 92 + studyAssayPlatform GenoVoxx GENOVOXX 93 + studyAssayPlatform GnuBio GNUBIO 94 + studyAssayPlatform Guava EasyCyte Mini (Millipore) GUAVA_EASYCYTE_MINI 95 + studyAssayPlatform Guava EasyCyte Plus (Millipore) GUAVA_EASYCYTE_PLUS 96 + studyAssayPlatform Guava Personal Cell Analysis (Millipore) GUAVA_PERSONAL_CELL 97 + studyAssayPlatform Guava Personal Cell Analysis-96 (Millipore) GUAVA_PERSONAL_CELL_96 98 + studyAssayPlatform Helicos BioSciences HELICOS_BIO 99 + studyAssayPlatform Illumina ILLUMINA 100 + studyAssayPlatform Indirect proportion method on LJ medium INDIRECT_LJ_MEDIUM 101 + studyAssayPlatform Indirect proportion method on Middlebrook Agar 7H9 INDIRECT_AGAR_7H9 102 + studyAssayPlatform Indirect proportion method on Middlebrook Agar 7H10 INDIRECT_AGAR_7H10 103 + studyAssayPlatform Indirect proportion method on Middlebrook Agar 7H11 INDIRECT_AGAR_7H11 104 + studyAssayPlatform inFlux Analyzer (Cytopeia) INFLUX_ANALYZER 105 + studyAssayPlatform Intelligent Bio-Systems INTELLIGENT_BIOSYSTEMS 106 + studyAssayPlatform ITQ 700 (Thermo Scientific) ITQ_700 107 + studyAssayPlatform ITQ 900 (Thermo Scientific) ITQ_900 108 + studyAssayPlatform ITQ 1100 (Thermo Scientific) ITQ_1100 109 + studyAssayPlatform JMS-53000 SpiralTOF (Jeol) JMS_53000_SPIRAL 110 + studyAssayPlatform LaserGen LASERGEN 111 + studyAssayPlatform LCMS-2020 (Shimadzu) LCMS_2020 112 + studyAssayPlatform LCMS-2010EV (Shimadzu) LCMS_2010EV 113 + studyAssayPlatform LCMS-IT-TOF (Shimadzu) LCMS_IT_TOF 114 + studyAssayPlatform Li-Cor LI_COR 115 + studyAssayPlatform Life Tech LIFE_TECH 116 + studyAssayPlatform LightSpeed Genomics LIGHTSPEED_GENOMICS 117 + studyAssayPlatform LCT Premier XE (Waters) LCT_PREMIER_XE 118 + studyAssayPlatform LCQ Deca XP MAX (Thermo Scientific) LCQ_DECA_XP_MAX 119 + studyAssayPlatform LCQ Fleet (Thermo Scientific) LCQ_FLEET 120 + studyAssayPlatform LXQ (Thermo Scientific) LXQ_THERMO 121 + studyAssayPlatform LTQ Classic (Thermo Scientific) LTQ_CLASSIC 122 + studyAssayPlatform LTQ XL (Thermo Scientific) LTQ_XL 123 + studyAssayPlatform LTQ Velos (Thermo Scientific) LTQ_VELOS 124 + studyAssayPlatform LTQ Orbitrap Classic (Thermo Scientific) LTQ_ORBITRAP_CLASSIC 125 + 
studyAssayPlatform LTQ Orbitrap XL (Thermo Scientific) LTQ_ORBITRAP_XL 126 + studyAssayPlatform LTQ Orbitrap Discovery (Thermo Scientific) LTQ_ORBITRAP_DISCOVERY 127 + studyAssayPlatform LTQ Orbitrap Velos (Thermo Scientific) LTQ_ORBITRAP_VELOS 128 + studyAssayPlatform Luminex 100 (Luminex) LUMINEX_100 129 + studyAssayPlatform Luminex 200 (Luminex) LUMINEX_200 130 + studyAssayPlatform MACS Quant (Miltenyi) MACS_QUANT 131 + studyAssayPlatform MALDI SYNAPT G2 HDMS (Waters) MALDI_SYNAPT_G2_HDMS 132 + studyAssayPlatform MALDI SYNAPT G2 MS (Waters) MALDI_SYNAPT_G2_MS 133 + studyAssayPlatform MALDI SYNAPT HDMS (Waters) MALDI_SYNAPT_HDMS 134 + studyAssayPlatform MALDI SYNAPT MS (Waters) MALDI_SYNAPT_MS 135 + studyAssayPlatform MALDI micro MX (Waters) MALDI_MICROMX 136 + studyAssayPlatform maXis (Bruker) MAXIS 137 + studyAssayPlatform maXis G4 (Bruker) MAXISG4 138 + studyAssayPlatform microflex LT MALDI-TOF MS (Bruker) MICROFLEX_LT_MALDI_TOF_MS 139 + studyAssayPlatform microflex LRF MALDI-TOF MS (Bruker) MICROFLEX_LRF_MALDI_TOF_MS 140 + studyAssayPlatform microflex III MALDI-TOF MS (Bruker) MICROFLEX_III_TOF_MS 141 + studyAssayPlatform micrOTOF II ESI TOF (Bruker) MICROTOF_II_ESI_TOF 142 + studyAssayPlatform micrOTOF-Q II ESI-Qq-TOF (Bruker) MICROTOF_Q_II_ESI_QQ_TOF 143 + studyAssayPlatform microplate Alamar Blue (resazurin) colorimetric method MICROPLATE_ALAMAR_BLUE_COLORIMETRIC 144 + studyAssayPlatform Mstation (Jeol) MSTATION 145 + studyAssayPlatform MSQ Plus (Thermo Scientific) MSQ_PLUS 146 + studyAssayPlatform NABsys NABSYS 147 + studyAssayPlatform Nanophotonics Biosciences NANOPHOTONICS_BIOSCIENCES 148 + studyAssayPlatform Network Biosystems NETWORK_BIOSYSTEMS 149 + studyAssayPlatform Nimblegen NIMBLEGEN 150 + studyAssayPlatform Oxford Nanopore Technologies OXFORD_NANOPORE_TECHNOLOGIES 151 + studyAssayPlatform Pacific Biosciences PACIFIC_BIOSCIENCES 152 + studyAssayPlatform Population Genetics Technologies POPULATION_GENETICS_TECHNOLOGIES 153 + studyAssayPlatform Q1000GC UltraQuad (Jeol) Q1000GC_ULTRAQUAD 154 + studyAssayPlatform Quattro micro API (Waters) QUATTRO_MICRO_API 155 + studyAssayPlatform Quattro micro GC (Waters) QUATTRO_MICRO_GC 156 + studyAssayPlatform Quattro Premier XE (Waters) QUATTRO_PREMIER_XE 157 + studyAssayPlatform QSTAR (AB Sciex) QSTAR 158 + studyAssayPlatform Reveo REVEO 159 + studyAssayPlatform Roche ROCHE 160 + studyAssayPlatform Seirad SEIRAD 161 + studyAssayPlatform solariX hybrid Qq-FTMS (Bruker) SOLARIX_HYBRID_QQ_FTMS 162 + studyAssayPlatform Somacount (Bently Instruments) SOMACOUNT 163 + studyAssayPlatform SomaScope (Bently Instruments) SOMASCOPE 164 + studyAssayPlatform SYNAPT G2 HDMS (Waters) SYNAPT_G2_HDMS 165 + studyAssayPlatform SYNAPT G2 MS (Waters) SYNAPT_G2_MS 166 + studyAssayPlatform SYNAPT HDMS (Waters) SYNAPT_HDMS 167 + studyAssayPlatform SYNAPT MS (Waters) SYNAPT_MS 168 + studyAssayPlatform TripleTOF 5600 (AB Sciex) TRIPLETOF_5600 169 + studyAssayPlatform TSQ Quantum Ultra (Thermo Scientific) TSQ_QUANTUM_ULTRA 170 + studyAssayPlatform TSQ Quantum Access (Thermo Scientific) TSQ_QUANTUM_ACCESS 171 + studyAssayPlatform TSQ Quantum Access MAX (Thermo Scientific) TSQ_QUANTUM_ACCESS_MAX 172 + studyAssayPlatform TSQ Quantum Discovery MAX (Thermo Scientific) TSQ_QUANTUM_DISCOVERY_MAX 173 + studyAssayPlatform TSQ Quantum GC (Thermo Scientific) TSQ_QUANTUM_GC 174 + studyAssayPlatform TSQ Quantum XLS (Thermo Scientific) TSQ_QUANTUM_XLS 175 + studyAssayPlatform TSQ Vantage (Thermo Scientific) TSQ_VANTAGE 176 + studyAssayPlatform ultrafleXtreme MALDI-TOF MS 
(Bruker) ULTRAFLEXTREME_MALDI_TOF_MS 177 + studyAssayPlatform VisiGen Biotechnologies VISIGEN_BIO 178 + studyAssayPlatform Xevo G2 QTOF (Waters) XEVO_G2_QTOF 179 + studyAssayPlatform Xevo QTof MS (Waters) XEVO_QTOF_MS 180 + studyAssayPlatform Xevo TQ MS (Waters) XEVO_TQ_MS 181 + studyAssayPlatform Xevo TQ-S (Waters) XEVO_TQ_S 182 studyAssayPlatform Other OTHER_PLATFORM 183 \ No newline at end of file diff --git a/scripts/search/data/shape/shapefile.zip b/scripts/search/data/shape/shapefile.zip new file mode 100644 index 00000000000..c4da60f0b80 Binary files /dev/null and b/scripts/search/data/shape/shapefile.zip differ diff --git a/scripts/zipdownload/src/main/java/edu/harvard/iq/dataverse/custom/service/download/ZipDownloadService.java b/scripts/zipdownload/src/main/java/edu/harvard/iq/dataverse/custom/service/download/ZipDownloadService.java index cda7cbb9505..4b66ee770d5 100644 --- a/scripts/zipdownload/src/main/java/edu/harvard/iq/dataverse/custom/service/download/ZipDownloadService.java +++ b/scripts/zipdownload/src/main/java/edu/harvard/iq/dataverse/custom/service/download/ZipDownloadService.java @@ -127,6 +127,7 @@ public void processFiles() { } Set zippedFolders = new HashSet<>(); + Set fileNamesList = new HashSet<>(); for (String [] fileEntry : jobFiles) { String storageLocation = fileEntry[0]; @@ -144,13 +145,15 @@ public void processFiles() { InputStream inputStream = this.directAccessUtil.openDirectAccess(storageLocation); - // (potential?) TODO: String zipEntryName = checkZipEntryName(fileName); + String zipEntryName = checkZipEntryName(fileName, fileNamesList); // this may not be needed anymore - some extra sanitizing of the file // name we used to have to do - since all the values in a current Dataverse - // database may already be santized enough. + // database may already be santized enough. + // (Edit: Yes, we still need this - there are still datasets with multiple + // files with duplicate names; this method takes care of that) if (inputStream != null && this.zipOutputStream != null) { - ZipEntry entry = new ZipEntry(fileName); + ZipEntry entry = new ZipEntry(zipEntryName); byte[] bytes = new byte[2 * 8192]; int read = 0; @@ -158,8 +161,8 @@ public void processFiles() { try { // Does this file have a folder name? 
- if (hasFolder(fileName)) { - addFolderToZipStream(getFolderName(fileName), zippedFolders); + if (hasFolder(zipEntryName)) { + addFolderToZipStream(getFolderName(zipEntryName), zippedFolders); } this.zipOutputStream.putNextEntry(entry); @@ -168,7 +171,6 @@ public void processFiles() { this.zipOutputStream.write(bytes, 0, read); readSize += read; } - inputStream.close(); this.zipOutputStream.closeEntry(); /*if (fileSize == readSize) { @@ -178,6 +180,12 @@ public void processFiles() { }*/ } catch (IOException ioex) { System.err.println("Failed to compress "+storageLocation); + } finally { + try { + inputStream.close(); + } catch (IOException ioexIgnore) { + System.err.println("Warning: IO exception trying to close input stream - "+storageLocation); + } } } else { System.err.println("Failed to access "+storageLocation); @@ -237,4 +245,21 @@ private void addFolderToZipStream(String folderName, Set zippedFolders) } } } + + // check for and process duplicates: + private String checkZipEntryName(String originalName, Set fileNames) { + String name = originalName; + int fileSuffix = 1; + int extensionIndex = originalName.lastIndexOf("."); + + while (fileNames.contains(name)) { + if (extensionIndex != -1) { + name = originalName.substring(0, extensionIndex) + "_" + fileSuffix++ + originalName.substring(extensionIndex); + } else { + name = originalName + "_" + fileSuffix++; + } + } + fileNames.add(name); + return name; + } } diff --git a/src/main/java/ValidationMessages.properties b/src/main/java/ValidationMessages.properties index 9c4f69252cf..4dfce141f41 100644 --- a/src/main/java/ValidationMessages.properties +++ b/src/main/java/ValidationMessages.properties @@ -18,6 +18,7 @@ dataverse.aliasLength=Alias must be at most 60 characters. dataverse.aliasNotnumber=Alias should not be a number dataverse.nameIllegalCharacters=Found an illegal character(s). Valid characters are a-Z, 0-9, '_', and '-'. dataverse.category=Please select a category for your dataverse. +dataverse.contact=Please enter a valid email address. contenttype.slash=Content-Type must contain a slash setspec.notNumber=Setspec should not be a number setspec.maxLength=Setspec must be at most 30 characters. @@ -47,3 +48,4 @@ password.current=Please enter your current password. password.validate=Password reset page default email message. guestbook.name=Enter a name for the guestbook + diff --git a/src/main/java/ValidationMessages_fr.properties b/src/main/java/ValidationMessages_fr.properties deleted file mode 100644 index 40c43b00969..00000000000 --- a/src/main/java/ValidationMessages_fr.properties +++ /dev/null @@ -1,49 +0,0 @@ -user.firstName=Veuillez entrer votre prénom. -user.lastName=Veuillez entrer votre nom de famille. -user.invalidEmail=Veuillez entrer une adresse courriel valide. -user.enterUsername=Veuillez entrer un nom d'utilisateur. -user.usernameLength=Le nom d'utilisateur doit comporter entre 2 et 60 caractères. -user.illegalCharacters=Caractère(s) non valide(s) utilisé(s). Les caractères valides sont a-Z, 0-9, '_', '-' et '.'. - -user.enterNickname=Veuillez entrer un pseudonyme. -user.nicknameLength=Le pseudonyme ne peut excéder 30 caractères. -user.nicknameNotnumber=Le pseudonyme ne devrait pas être un nombre - -dataset.templatename=Veuillez ajouter un nom pour le modèle d'ensemble de données. -dataset.nameLength=Le nom ne peut excéder 255 caractères. - -dataverse.name=Veuillez entrer un nom. -dataverse.alias=Veuillez entrer un alias. -dataverse.aliasLength=L'alias ne peut excéder 60 caractères. 
-dataverse.aliasNotnumber=L'alias ne devrait pas être un nombre. -dataverse.nameIllegalCharacters=Caractère(s) non valide(s) utilisé(s). Les caractères valides sont a-Z, 0-9, '_', '-'. -dataverse.category=Veuillez sélectionner une catégorie pour votre dataverse. -contenttype.slash=Le type de contenu doit contenir une barre oblique. -setspec.notNumber=Le nom (Setspec) ne devrait pas être un nombre. -setspec.maxLength=Le nom (Setspec) ne peut excéder 30 caractères. - -role.name=Un rôle doit avoir un nom. -desc.maxLength=Le description ne peut excéder 255 caractères. -alias.maxLength=L'alias ne peut excéder 26 caractères. -alias.illegalCharacters=L'alias ne peut être vide. Les caractères valides sont a-Z, 0-9, '_', '-'. - -custom.response=Veuillez entrer la réponse. -custom.questiontext=Veuillez entrer le texte de la question. -filename.illegalCharacters=Le nom du fichier ne peut contenir aucun des caractères suivants\u00A0: \ / : * ? " < > | ; # . -directoryname.illegalCharacters=Le nom du répertoire ne peut pas être suivi ni précédé d'un caractère séparateur de fichiers. -filename.blank=Veuillez spécifier un nom de fichier. - - -map.layername=Veuillez spécifier un nom de couche cartographique. -map.layerlink=Veuillez spécifier un lien de couche cartographique. -map.link=Veuillez spécifier un lien de carte intégrée. -map.imagelink=Veuillez spécifier un lien d'image de carte. -map.username=Veuillez spécifier un nom d'utilisateur WorldMap. - -oauth.username=Veuillez entrer votre nom d'utilisateur. - -password.retype=Le nouveau mot de passe est vide\u00A0: veuillez le retaper à nouveau. -password.current=Veuillez entrer votre mot de passe actuel. -password.validate=Page de réinitialisation du mot de passe par défaut. - -guestbook.name=Saisir un nom pour le registre de visiteurs. 
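A minimal, standalone sketch of the duplicate-name handling that the ZipDownloadService change above introduces: when two files in the same zip job share a name, a numeric suffix is inserted before the extension (or appended when there is no extension). The helper mirrors the new private checkZipEntryName method; the demo class and its main method are only for illustration.

import java.util.HashSet;
import java.util.Set;

public class ZipEntryNameDemo {

    // Mirrors the new ZipDownloadService helper: if a name was already used,
    // append _1, _2, ... before the extension until the name is unique.
    static String checkZipEntryName(String originalName, Set<String> usedNames) {
        String name = originalName;
        int fileSuffix = 1;
        int extensionIndex = originalName.lastIndexOf(".");
        while (usedNames.contains(name)) {
            if (extensionIndex != -1) {
                name = originalName.substring(0, extensionIndex) + "_" + fileSuffix++ + originalName.substring(extensionIndex);
            } else {
                name = originalName + "_" + fileSuffix++;
            }
        }
        usedNames.add(name);
        return name;
    }

    public static void main(String[] args) {
        Set<String> used = new HashSet<>();
        System.out.println(checkZipEntryName("data.csv", used)); // data.csv
        System.out.println(checkZipEntryName("data.csv", used)); // data_1.csv
        System.out.println(checkZipEntryName("data.csv", used)); // data_2.csv
        System.out.println(checkZipEntryName("README", used));   // README
        System.out.println(checkZipEntryName("README", used));   // README_1
    }
}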
diff --git a/src/main/java/edu/harvard/iq/dataverse/DOIDataCiteRegisterService.java b/src/main/java/edu/harvard/iq/dataverse/DOIDataCiteRegisterService.java index ca9e55e2f92..ba503a18d22 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DOIDataCiteRegisterService.java +++ b/src/main/java/edu/harvard/iq/dataverse/DOIDataCiteRegisterService.java @@ -118,20 +118,11 @@ public String reserveIdentifier(String identifier, Map metadata, } else { rc.setUrl(target); } - try { - DataCiteRESTfullClient client = getClient(); - retString = client.postMetadata(xmlMetadata); - } catch (UnsupportedEncodingException ex) { - Logger.getLogger(DOIDataCiteRegisterService.class.getName()).log(Level.SEVERE, null, ex); - } - } else { - try { - DataCiteRESTfullClient client = getClient(); - retString = client.postMetadata(xmlMetadata); - } catch (UnsupportedEncodingException ex) { - Logger.getLogger(DOIDataCiteRegisterService.class.getName()).log(Level.SEVERE, null, ex); - } } + + DataCiteRESTfullClient client = getClient(); + retString = client.postMetadata(xmlMetadata); + return retString; } @@ -149,22 +140,12 @@ public String registerIdentifier(String identifier, Map metadata } else { rc.setUrl(target); } - try { - DataCiteRESTfullClient client = getClient(); - retString = client.postMetadata(xmlMetadata); - client.postUrl(identifier.substring(identifier.indexOf(":") + 1), target); - } catch (UnsupportedEncodingException ex) { - Logger.getLogger(DOIDataCiteRegisterService.class.getName()).log(Level.SEVERE, null, ex); - } - } else { - try { - DataCiteRESTfullClient client = getClient(); - retString = client.postMetadata(xmlMetadata); - client.postUrl(identifier.substring(identifier.indexOf(":") + 1), target); - } catch (UnsupportedEncodingException ex) { - Logger.getLogger(DOIDataCiteRegisterService.class.getName()).log(Level.SEVERE, null, ex); - } } + + DataCiteRESTfullClient client = getClient(); + retString = client.postMetadata(xmlMetadata); + client.postUrl(identifier.substring(identifier.indexOf(":") + 1), target); + return retString; } diff --git a/src/main/java/edu/harvard/iq/dataverse/DataCiteRESTfullClient.java b/src/main/java/edu/harvard/iq/dataverse/DataCiteRESTfullClient.java index 913dc4d0034..491f19ab36c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataCiteRESTfullClient.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataCiteRESTfullClient.java @@ -9,8 +9,6 @@ import java.io.Closeable; import java.io.IOException; -import java.io.UnsupportedEncodingException; - import java.util.logging.Level; import java.util.logging.Logger; @@ -102,24 +100,19 @@ public String getUrl(String doi) { * @param url * @return */ - public String postUrl(String doi, String url) throws UnsupportedEncodingException { + public String postUrl(String doi, String url) throws IOException { HttpPost httpPost = new HttpPost(this.url + "/doi"); httpPost.setHeader("Content-Type", "text/plain;charset=UTF-8"); httpPost.setEntity(new StringEntity("doi=" + doi + "\nurl=" + url, "utf-8")); - try { - HttpResponse response = httpClient.execute(httpPost,context); - String data = EntityUtils.toString(response.getEntity(), encoding); - if (response.getStatusLine().getStatusCode() != 201) { - String errMsg = "Response code: " + response.getStatusLine().getStatusCode() + ", " + data; - logger.log(Level.SEVERE,errMsg); - throw new RuntimeException(errMsg); - } - return data; - } catch (IOException ioe) { - logger.log(Level.SEVERE,"IOException when post url"); - throw new RuntimeException("IOException when post url", ioe); + 
HttpResponse response = httpClient.execute(httpPost, context); + String data = EntityUtils.toString(response.getEntity(), encoding); + if (response.getStatusLine().getStatusCode() != 201) { + String errMsg = "Response from postUrl: " + response.getStatusLine().getStatusCode() + ", " + data; + logger.log(Level.SEVERE, errMsg); + throw new IOException(errMsg); } + return data; } /** @@ -135,7 +128,7 @@ public String getMetadata(String doi) { HttpResponse response = httpClient.execute(httpGet,context); String data = EntityUtils.toString(response.getEntity(), encoding); if (response.getStatusLine().getStatusCode() != 200) { - String errMsg = "Response code: " + response.getStatusLine().getStatusCode() + ", " + data; + String errMsg = "Response from getMetadata: " + response.getStatusLine().getStatusCode() + ", " + data; logger.log(Level.SEVERE, errMsg); throw new RuntimeException(errMsg); } @@ -152,21 +145,16 @@ public String getMetadata(String doi) { * @param doi * @return boolean true if identifier already exists on DataCite site */ - public boolean testDOIExists(String doi) { - HttpGet httpGet = new HttpGet(this.url + "/metadata/" + doi); - httpGet.setHeader("Accept", "application/xml"); - try { - HttpResponse response = httpClient.execute(httpGet,context); - if (response.getStatusLine().getStatusCode() != 200) { - EntityUtils.consumeQuietly(response.getEntity()); - return false; - } + public boolean testDOIExists(String doi) throws IOException { + HttpGet httpGet = new HttpGet(this.url + "/metadata/" + doi); + httpGet.setHeader("Accept", "application/xml"); + HttpResponse response = httpClient.execute(httpGet, context); + if (response.getStatusLine().getStatusCode() != 200) { EntityUtils.consumeQuietly(response.getEntity()); - return true; - } catch (IOException ioe) { - logger.log(Level.SEVERE, "IOException when get metadata"); - throw new RuntimeException("IOException when get metadata", ioe); - } + return false; + } + EntityUtils.consumeQuietly(response.getEntity()); + return true; } /** @@ -182,7 +170,7 @@ public String postMetadata(String metadata) throws IOException { HttpResponse response = httpClient.execute(httpPost, context); String data = EntityUtils.toString(response.getEntity(), encoding); if (response.getStatusLine().getStatusCode() != 201) { - String errMsg = "Response code: " + response.getStatusLine().getStatusCode() + ", " + data; + String errMsg = "Response from postMetadata: " + response.getStatusLine().getStatusCode() + ", " + data; logger.log(Level.SEVERE, errMsg); throw new IOException(errMsg); } diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java index 4d04ee1889d..1aadcc9851e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java @@ -580,7 +580,7 @@ public void findFileMetadataOptimizedExperimental(Dataset owner, DatasetVersion int i = 0; - List dataTableResults = em.createNativeQuery("SELECT t0.ID, t0.DATAFILE_ID, t0.UNF, t0.CASEQUANTITY, t0.VARQUANTITY, t0.ORIGINALFILEFORMAT, t0.ORIGINALFILESIZE FROM dataTable t0, dataFile t1, dvObject t2 WHERE ((t0.DATAFILE_ID = t1.ID) AND (t1.ID = t2.ID) AND (t2.OWNER_ID = " + owner.getId() + ")) ORDER BY t0.ID").getResultList(); + List dataTableResults = em.createNativeQuery("SELECT t0.ID, t0.DATAFILE_ID, t0.UNF, t0.CASEQUANTITY, t0.VARQUANTITY, t0.ORIGINALFILEFORMAT, t0.ORIGINALFILESIZE, t0.ORIGINALFILENAME FROM dataTable t0, dataFile t1, 
dvObject t2 WHERE ((t0.DATAFILE_ID = t1.ID) AND (t1.ID = t2.ID) AND (t2.OWNER_ID = " + owner.getId() + ")) ORDER BY t0.ID").getResultList(); for (Object[] result : dataTableResults) { DataTable dataTable = new DataTable(); @@ -598,6 +598,8 @@ public void findFileMetadataOptimizedExperimental(Dataset owner, DatasetVersion dataTable.setOriginalFileSize((Long)result[6]); + dataTable.setOriginalFileName((String)result[7]); + dataTables.add(dataTable); datatableMap.put(fileId, i++); @@ -858,8 +860,10 @@ private List retrieveFileMetadataForVersion(Dataset dataset, Datas fileMetadata.setDatasetVersion(version); - //fileMetadata.setDataFile(dataset.getFiles().get(file_list_id)); + // Link the FileMetadata object to the DataFile: fileMetadata.setDataFile(dataFiles.get(file_list_id)); + // ... and the DataFile back to the FileMetadata: + fileMetadata.getDataFile().getFileMetadatas().add(fileMetadata); String description = (String) result[2]; diff --git a/src/main/java/edu/harvard/iq/dataverse/Dataset.java b/src/main/java/edu/harvard/iq/dataverse/Dataset.java index 13a8692fdd4..4cf95dda250 100644 --- a/src/main/java/edu/harvard/iq/dataverse/Dataset.java +++ b/src/main/java/edu/harvard/iq/dataverse/Dataset.java @@ -39,6 +39,10 @@ * @author skraffmiller */ @NamedQueries({ + @NamedQuery(name = "Dataset.findIdStale", + query = "SELECT d.id FROM Dataset d WHERE d.indexTime is NULL OR d.indexTime < d.modificationTime"), + @NamedQuery(name = "Dataset.findIdStalePermission", + query = "SELECT d.id FROM Dataset d WHERE d.permissionIndexTime is NULL OR d.permissionIndexTime < d.permissionModificationTime"), @NamedQuery(name = "Dataset.findByIdentifier", query = "SELECT d FROM Dataset d WHERE d.identifier=:identifier"), @NamedQuery(name = "Dataset.findByIdentifierAuthorityProtocol", diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index d1cfb184462..4ffd7d05d3f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -1886,7 +1886,7 @@ private String init(boolean initFull) { //retrieveDatasetVersionResponse = datasetVersionService.retrieveDatasetVersionByVersionId(versionId); } - this.maxFileUploadSizeInBytes = systemConfig.getMaxFileUploadSizeForStore(dataset.getOwner().getEffectiveStorageDriverId()); + this.maxFileUploadSizeInBytes = systemConfig.getMaxFileUploadSizeForStore(dataset.getEffectiveStorageDriverId()); if (retrieveDatasetVersionResponse == null) { @@ -3658,7 +3658,7 @@ public String save() { // have been created in the dataset. 
dataset = datasetService.find(dataset.getId()); - List filesAdded = ingestService.saveAndAddFilesToDataset(dataset.getEditVersion(), newFiles); + List filesAdded = ingestService.saveAndAddFilesToDataset(dataset.getEditVersion(), newFiles, false); newFiles.clear(); // and another update command: diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java index c4049f3be00..c1efe119fd2 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java @@ -172,8 +172,15 @@ public List filterByPidQuery(String filterQuery) { public List findAll() { return em.createQuery("select object(o) from Dataset as o order by o.id", Dataset.class).getResultList(); } - - + + public List findIdStale() { + return em.createNamedQuery("Dataset.findIdStale").getResultList(); + } + + public List findIdStalePermission() { + return em.createNamedQuery("Dataset.findIdStalePermission").getResultList(); + } + public List findAllLocalDatasetIds() { return em.createQuery("SELECT o.id FROM Dataset o WHERE o.harvestedFrom IS null ORDER BY o.id", Long.class).getResultList(); } diff --git a/src/main/java/edu/harvard/iq/dataverse/Dataverse.java b/src/main/java/edu/harvard/iq/dataverse/Dataverse.java index 75dbb39e2ca..5aab9ef9a9e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/Dataverse.java +++ b/src/main/java/edu/harvard/iq/dataverse/Dataverse.java @@ -43,6 +43,8 @@ * @author mbarsinai */ @NamedQueries({ + @NamedQuery(name = "Dataverse.findIdStale",query = "SELECT d.id FROM Dataverse d WHERE d.indexTime is NULL OR d.indexTime < d.modificationTime"), + @NamedQuery(name = "Dataverse.findIdStalePermission",query = "SELECT d.id FROM Dataverse d WHERE d.permissionIndexTime is NULL OR d.permissionIndexTime < d.permissionModificationTime"), @NamedQuery(name = "Dataverse.ownedObjectsById", query = "SELECT COUNT(obj) FROM DvObject obj WHERE obj.owner.id=:id"), @NamedQuery(name = "Dataverse.findAll", query = "SELECT d FROM Dataverse d order by d.name"), @NamedQuery(name = "Dataverse.findRoot", query = "SELECT d FROM Dataverse d where d.owner.id=null"), @@ -151,7 +153,7 @@ public String getIndexableCategoryName() { private String affiliation; - private String storageDriver=null; + ///private String storageDriver=null; // Note: We can't have "Remove" here, as there are role assignments that refer // to this role. So, adding it would mean violating a foreign key constraint.
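The new Dataset.findIdStale and Dataverse.findIdStale named queries above encode a simple staleness rule: an object needs (re)indexing if it has never been indexed, or if it was modified after it was last indexed; the findIdStalePermission variants apply the same rule to the permission timestamps. A small sketch of that rule, using java.time.Instant as a stand-in for the entity timestamp fields:

import java.time.Instant;

public class StalenessDemo {

    // The rule behind the findIdStale queries: stale when never indexed,
    // or when the object was modified after it was last indexed.
    static boolean isStale(Instant indexTime, Instant modificationTime) {
        return indexTime == null || indexTime.isBefore(modificationTime);
    }

    public static void main(String[] args) {
        Instant earlier = Instant.parse("2020-09-01T00:00:00Z");
        Instant later = Instant.parse("2020-09-15T00:00:00Z");
        System.out.println(isStale(null, later));     // true: never indexed
        System.out.println(isStale(earlier, later));  // true: modified after last index
        System.out.println(isStale(later, earlier));  // false: index is up to date
    }
}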
@@ -761,32 +763,4 @@ public boolean isAncestorOf( DvObject other ) { } return false; } - - public String getEffectiveStorageDriverId() { - String id = storageDriver; - if(StringUtils.isBlank(id)) { - if(this.getOwner() != null) { - id = this.getOwner().getEffectiveStorageDriverId(); - } else { - id= DataAccess.DEFAULT_STORAGE_DRIVER_IDENTIFIER; - } - } - return id; - } - - - public String getStorageDriverId() { - if(storageDriver==null) { - return DataAccess.UNDEFINED_STORAGE_DRIVER_IDENTIFIER; - } - return storageDriver; - } - - public void setStorageDriverId(String storageDriver) { - if(storageDriver!=null&&storageDriver.equals(DataAccess.UNDEFINED_STORAGE_DRIVER_IDENTIFIER)) { - this.storageDriver=null; - } else { - this.storageDriver = storageDriver; - } - } } diff --git a/src/main/java/edu/harvard/iq/dataverse/DataverseServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DataverseServiceBean.java index 207dc829bf6..0e13936f0f7 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataverseServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataverseServiceBean.java @@ -127,6 +127,13 @@ public Dataverse find(Object pk) { public List findAll() { return em.createNamedQuery("Dataverse.findAll").getResultList(); } + + public List findIdStale() { + return em.createNamedQuery("Dataverse.findIdStale").getResultList(); + } + public List findIdStalePermission() { + return em.createNamedQuery("Dataverse.findIdStalePermission").getResultList(); + } /** * @param numPartitions The number of partitions you intend to split the diff --git a/src/main/java/edu/harvard/iq/dataverse/DataverseSession.java b/src/main/java/edu/harvard/iq/dataverse/DataverseSession.java index e15badc994b..d8d73ceaf3e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataverseSession.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataverseSession.java @@ -49,6 +49,20 @@ public class DataverseSession implements Serializable{ private static final Logger logger = Logger.getLogger(DataverseSession.class.getCanonicalName()); private boolean statusDismissed = false; + + /** + * If debug is set to true, some pages show extra debugging information to + * superusers. + * + * The way to set the Boolean to true is to pass debug=true as a query + * parameter. The Boolean will remain true (even if nothing is passed to it) + * until debug=false is passed. + * + * Because a boolean is false by default when it comes from a viewParam we + * use a Boolean instead. That way, if the debug viewParam is null, we can + * leave the state alone (see setDebug()). + */ + private Boolean debug; public User getUser() { if ( user == null ) { @@ -82,7 +96,22 @@ public boolean isStatusDismissed() { public void setStatusDismissed(boolean status) { statusDismissed = status; //MAD: Set to true to enable code! } - + + public Boolean getDebug() { + // Only superusers get extra debugging information. + if (!getUser().isSuperuser()) { + return false; + } + return debug; + } + + public void setDebug(Boolean debug) { + // Leave the debug state alone if nothing is passed. 
+ if (debug != null) { + this.debug = debug; + } + } + public StaticPermissionQuery on( Dataverse d ) { return permissionsService.userOn(user, d); } diff --git a/src/main/java/edu/harvard/iq/dataverse/DvObject.java b/src/main/java/edu/harvard/iq/dataverse/DvObject.java index 8a2f710c428..f1041303fdd 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DvObject.java +++ b/src/main/java/edu/harvard/iq/dataverse/DvObject.java @@ -20,6 +20,8 @@ query = "SELECT o FROM DvObject o ORDER BY o.id"), @NamedQuery(name = "DvObject.findById", query = "SELECT o FROM DvObject o WHERE o.id=:id"), + @NamedQuery(name = "DvObject.checkExists", + query = "SELECT count(o) from DvObject o WHERE o.id=:id"), @NamedQuery(name = "DvObject.ownedObjectsById", query="SELECT COUNT(obj) FROM DvObject obj WHERE obj.owner.id=:id"), @NamedQuery(name = "DvObject.findByGlobalId", diff --git a/src/main/java/edu/harvard/iq/dataverse/DvObjectContainer.java b/src/main/java/edu/harvard/iq/dataverse/DvObjectContainer.java index e40eb1c2a3a..f6b396f4c00 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DvObjectContainer.java +++ b/src/main/java/edu/harvard/iq/dataverse/DvObjectContainer.java @@ -1,6 +1,8 @@ package edu.harvard.iq.dataverse; +import edu.harvard.iq.dataverse.dataaccess.DataAccess; import javax.persistence.MappedSuperclass; +import org.apache.commons.lang.StringUtils; /** * A {@link DvObject} that can contain other {@link DvObject}s. @@ -26,4 +28,32 @@ public boolean isEffectivelyPermissionRoot() { return isPermissionRoot() || (getOwner() == null); } + private String storageDriver=null; + + public String getEffectiveStorageDriverId() { + String id = storageDriver; + if (StringUtils.isBlank(id)) { + if (this.getOwner() != null) { + id = this.getOwner().getEffectiveStorageDriverId(); + } else { + id = DataAccess.DEFAULT_STORAGE_DRIVER_IDENTIFIER; + } + } + return id; + } + + public String getStorageDriverId() { + if (storageDriver == null) { + return DataAccess.UNDEFINED_STORAGE_DRIVER_IDENTIFIER; + } + return storageDriver; + } + + public void setStorageDriverId(String storageDriver) { + if (storageDriver != null && storageDriver.equals(DataAccess.UNDEFINED_STORAGE_DRIVER_IDENTIFIER)) { + this.storageDriver = null; + } else { + this.storageDriver = storageDriver; + } + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/DvObjectServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DvObjectServiceBean.java index 25f1d10f13f..4830c422d05 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DvObjectServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DvObjectServiceBean.java @@ -72,7 +72,13 @@ public List findByAuthenticatedUserId(AuthenticatedUser user) { query.setParameter("releaseUserId", user.getId()); return query.getResultList(); } - + + public boolean checkExists(Long id) { + Query query = em.createNamedQuery("DvObject.checkExists"); + query.setParameter("id", id); + Long result =(Long)query.getSingleResult(); + return result > 0; + } // FIXME This type-by-string has to go, in favor of passing a class parameter. 
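The getEffectiveStorageDriverId() logic moved from Dataverse into DvObjectContainer above resolves a storage driver by walking up the ownership chain until an explicit driver is found, falling back to the installation default at the root. A self-contained sketch of that resolution follows; the class names and the default identifier value are stand-ins for this sketch, not the real entities.

public class EffectiveStorageDriverDemo {

    // Stand-in for DataAccess.DEFAULT_STORAGE_DRIVER_IDENTIFIER (the value is an assumption).
    static final String DEFAULT_STORAGE_DRIVER_IDENTIFIER = "file";

    static class Container {
        final Container owner;
        final String storageDriver; // null or blank means "not set at this level"

        Container(Container owner, String storageDriver) {
            this.owner = owner;
            this.storageDriver = storageDriver;
        }

        // Mirrors DvObjectContainer.getEffectiveStorageDriverId(): use the local
        // driver if set, otherwise inherit from the owner, else use the default.
        String getEffectiveStorageDriverId() {
            if (storageDriver != null && !storageDriver.trim().isEmpty()) {
                return storageDriver;
            }
            return (owner != null) ? owner.getEffectiveStorageDriverId() : DEFAULT_STORAGE_DRIVER_IDENTIFIER;
        }
    }

    public static void main(String[] args) {
        Container root = new Container(null, null);          // root collection: no explicit driver
        Container dataverse = new Container(root, "s3big");  // dataverse-level driver
        Container dataset = new Container(dataverse, null);  // dataset inherits from its dataverse
        System.out.println(root.getEffectiveStorageDriverId());    // file
        System.out.println(dataset.getEffectiveStorageDriverId()); // s3big
    }
}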
public DvObject findByGlobalId(String globalIdString, String typeString) { return findByGlobalId(globalIdString, typeString, false); diff --git a/src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java b/src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java index b6c4cc744b2..b28d5f2c471 100644 --- a/src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java @@ -453,7 +453,7 @@ public String initCreateMode(String modeToken, DatasetVersion version, MutableBo uploadedFiles = uploadedFilesList; selectedFiles = selectedFileMetadatasList; - this.maxFileUploadSizeInBytes = systemConfig.getMaxFileUploadSizeForStore(dataset.getOwner().getEffectiveStorageDriverId()); + this.maxFileUploadSizeInBytes = systemConfig.getMaxFileUploadSizeForStore(dataset.getEffectiveStorageDriverId()); this.multipleUploadFilesLimit = systemConfig.getMultipleUploadFilesLimit(); logger.fine("done"); @@ -490,7 +490,7 @@ public String init() { - this.maxFileUploadSizeInBytes = systemConfig.getMaxFileUploadSizeForStore(dataset.getOwner().getEffectiveStorageDriverId()); + this.maxFileUploadSizeInBytes = systemConfig.getMaxFileUploadSizeForStore(dataset.getEffectiveStorageDriverId()); this.multipleUploadFilesLimit = systemConfig.getMultipleUploadFilesLimit(); workingVersion = dataset.getEditVersion(); @@ -1137,7 +1137,7 @@ public String save() { } // Try to save the NEW files permanently: - List filesAdded = ingestService.saveAndAddFilesToDataset(workingVersion, newFiles); + List filesAdded = ingestService.saveAndAddFilesToDataset(workingVersion, newFiles, false); // reset the working list of fileMetadatas, as to only include the ones // that have been added to the version successfully: diff --git a/src/main/java/edu/harvard/iq/dataverse/FileDownloadServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/FileDownloadServiceBean.java index 683142fc5c4..a91a28d3d96 100644 --- a/src/main/java/edu/harvard/iq/dataverse/FileDownloadServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileDownloadServiceBean.java @@ -20,6 +20,7 @@ import edu.harvard.iq.dataverse.privateurl.PrivateUrlServiceBean; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; import edu.harvard.iq.dataverse.util.FileUtil; +import edu.harvard.iq.dataverse.util.StringUtil; import java.io.IOException; import java.sql.Timestamp; import java.util.ArrayList; @@ -563,6 +564,10 @@ public void addFileToCustomZipJob(String key, DataFile dataFile, Timestamp times fileName = dataFile.getFileMetadata().getLabel(); } } + + if (StringUtil.nonEmpty(dataFile.getFileMetadata().getDirectoryLabel())) { + fileName = dataFile.getFileMetadata().getDirectoryLabel() + "/" + fileName; + } if (location != null && fileName != null) { em.createNativeQuery("INSERT INTO CUSTOMZIPSERVICEREQUEST (KEY, STORAGELOCATION, FILENAME, ISSUETIME) VALUES (" diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java index 07d5944af05..655cdafe04c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java @@ -156,6 +156,7 @@ import org.glassfish.jersey.media.multipart.FormDataParam; import com.amazonaws.services.s3.model.PartETag; +import java.util.Map.Entry; @Path("datasets") public class Datasets extends AbstractApiBean { @@ -2204,6 +2205,87 @@ public Response getMakeDataCountMetric(@PathParam("id") String idSupplied, @Path return wr.getResponse(); } } - + + 
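The dataset-level storage driver endpoints added just below (GET, PUT, and DELETE on /api/datasets/{identifier}/storageDriver) can be exercised with any HTTP client; a sketch using java.net.http follows. The base URL, dataset id, store label, and the X-Dataverse-key API-token header are assumptions of this sketch rather than values taken from the diff.

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class DatasetStoreApiDemo {
    public static void main(String[] args) throws Exception {
        // Assumed values; adjust to your installation.
        String base = "http://localhost:8080/api/datasets/123/storageDriver";
        String apiToken = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx";

        HttpClient client = HttpClient.newHttpClient();

        // Read the dataset's effective store.
        HttpRequest get = HttpRequest.newBuilder(URI.create(base))
                .header("X-Dataverse-key", apiToken)
                .GET().build();
        System.out.println(client.send(get, HttpResponse.BodyHandlers.ofString()).body());

        // Point the dataset at a specific store by its label (superuser only).
        HttpRequest put = HttpRequest.newBuilder(URI.create(base))
                .header("X-Dataverse-key", apiToken)
                .PUT(HttpRequest.BodyPublishers.ofString("LocalStore")).build();
        System.out.println(client.send(put, HttpResponse.BodyHandlers.ofString()).body());

        // Reset the dataset to the default store (superuser only).
        HttpRequest delete = HttpRequest.newBuilder(URI.create(base))
                .header("X-Dataverse-key", apiToken)
                .DELETE().build();
        System.out.println(client.send(delete, HttpResponse.BodyHandlers.ofString()).body());
    }
}

The PUT body is the store label as plain text, which matches how the endpoint compares the submitted label against the entries returned by DataAccess.getStorageDriverLabels().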
@GET + @Path("{identifier}/storageDriver") + public Response getFileStore(@PathParam("identifier") String dvIdtf, + @Context UriInfo uriInfo, @Context HttpHeaders headers) throws WrappedResponse { + + Dataset dataset; + + try { + dataset = findDatasetOrDie(dvIdtf); + } catch (WrappedResponse ex) { + return error(Response.Status.NOT_FOUND, "No such dataset"); + } + + return response(req -> ok(dataset.getEffectiveStorageDriverId())); + } + + @PUT + @Path("{identifier}/storageDriver") + public Response setFileStore(@PathParam("identifier") String dvIdtf, + String storageDriverLabel, + @Context UriInfo uriInfo, @Context HttpHeaders headers) throws WrappedResponse { + + // Superuser-only: + AuthenticatedUser user; + try { + user = findAuthenticatedUserOrDie(); + } catch (WrappedResponse ex) { + return error(Response.Status.BAD_REQUEST, "Authentication is required."); + } + if (!user.isSuperuser()) { + return error(Response.Status.FORBIDDEN, "Superusers only."); + } + + Dataset dataset; + + try { + dataset = findDatasetOrDie(dvIdtf); + } catch (WrappedResponse ex) { + return error(Response.Status.NOT_FOUND, "No such dataset"); + } + + // We don't want to allow setting this to a store id that does not exist: + for (Entry store : DataAccess.getStorageDriverLabels().entrySet()) { + if (store.getKey().equals(storageDriverLabel)) { + dataset.setStorageDriverId(store.getValue()); + datasetService.merge(dataset); + return ok("Storage driver set to: " + store.getKey() + "/" + store.getValue()); + } + } + return error(Response.Status.BAD_REQUEST, + "No Storage Driver found for : " + storageDriverLabel); + } + + @DELETE + @Path("{identifier}/storageDriver") + public Response resetFileStore(@PathParam("identifier") String dvIdtf, + @Context UriInfo uriInfo, @Context HttpHeaders headers) throws WrappedResponse { + + // Superuser-only: + AuthenticatedUser user; + try { + user = findAuthenticatedUserOrDie(); + } catch (WrappedResponse ex) { + return error(Response.Status.BAD_REQUEST, "Authentication is required."); + } + if (!user.isSuperuser()) { + return error(Response.Status.FORBIDDEN, "Superusers only."); + } + + Dataset dataset; + + try { + dataset = findDatasetOrDie(dvIdtf); + } catch (WrappedResponse ex) { + return error(Response.Status.NOT_FOUND, "No such dataset"); + } + + dataset.setStorageDriverId(null); + datasetService.merge(dataset); + return ok("Storage reset to default: " + DataAccess.DEFAULT_STORAGE_DRIVER_IDENTIFIER); + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Index.java b/src/main/java/edu/harvard/iq/dataverse/api/Index.java index c1f5f6957e6..406d4b98663 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Index.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Index.java @@ -43,7 +43,10 @@ import java.util.ArrayList; import java.util.List; import java.util.Set; +import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.logging.Logger; import javax.ejb.EJB; import javax.ejb.EJBException; @@ -70,7 +73,7 @@ public class Index extends AbstractApiBean { @EJB IndexServiceBean indexService; @EJB - IndexBatchServiceBean indexAllService; + IndexBatchServiceBean indexBatchService; @EJB SolrIndexServiceBean solrIndexService; @EJB @@ -148,7 +151,7 @@ private Response indexAllOrSubset(Long numPartitionsSelected, Long partitionIdTo availablePartitionIdsBuilder.add(i); } - JsonObjectBuilder preview = 
indexAllService.indexAllOrSubsetPreview(numPartitions, partitionIdToProcess, skipIndexed); + JsonObjectBuilder preview = indexBatchService.indexAllOrSubsetPreview(numPartitions, partitionIdToProcess, skipIndexed); if (previewOnly) { preview.add("args", args); preview.add("availablePartitionIds", availablePartitionIdsBuilder); @@ -162,7 +165,7 @@ private Response indexAllOrSubset(Long numPartitionsSelected, Long partitionIdTo * @todo How can we expose the String returned from "index all" via * the API? */ - Future indexAllFuture = indexAllService.indexAllOrSubset(numPartitions, partitionIdToProcess, skipIndexed, previewOnly); + Future indexAllFuture = indexBatchService.indexAllOrSubset(numPartitions, partitionIdToProcess, skipIndexed, previewOnly); JsonObject workloadPreview = preview.build().getJsonObject("previewOfPartitionWorkload"); int dataverseCount = workloadPreview.getInt("dataverseCount"); int datasetCount = workloadPreview.getInt("datasetCount"); @@ -377,100 +380,59 @@ public Response indexPermissions(@PathParam("id") Long id) { return ok(indexResponse.getMessage()); } } - + /** + * Checks whether there are inconsistencies between the Solr index and + * the database, and reports back the status by content type + * @param sync - optional parameter, if set, then run the command + * synchronously. Else, return immediately, and report the status in server.log + * @return status report + */ @GET @Path("status") - public Response indexStatus() { - JsonObjectBuilder contentInDatabaseButStaleInOrMissingFromSolr = getContentInDatabaseButStaleInOrMissingFromSolr(); - - JsonObjectBuilder contentInSolrButNotDatabase; - try { - contentInSolrButNotDatabase = getContentInSolrButNotDatabase(); - } catch (SearchException ex) { - return error(Response.Status.INTERNAL_SERVER_ERROR, "Can not determine index status. " + ex.getLocalizedMessage() + ". Is Solr down? Exception: " + ex.getCause().getLocalizedMessage()); - } - - JsonObjectBuilder permissionsInDatabaseButStaleInOrMissingFromSolr = getPermissionsInDatabaseButStaleInOrMissingFromSolr(); - JsonObjectBuilder permissionsInSolrButNotDatabase = getPermissionsInSolrButNotDatabase(); - - JsonObjectBuilder data = Json.createObjectBuilder() - .add("contentInDatabaseButStaleInOrMissingFromIndex", contentInDatabaseButStaleInOrMissingFromSolr) - .add("contentInIndexButNotDatabase", contentInSolrButNotDatabase) - .add("permissionsInDatabaseButStaleInOrMissingFromIndex", permissionsInDatabaseButStaleInOrMissingFromSolr) - .add("permissionsInIndexButNotDatabase", permissionsInSolrButNotDatabase); - - return ok(data); - } - - private JsonObjectBuilder getContentInDatabaseButStaleInOrMissingFromSolr() { - List stateOrMissingDataverses = indexService.findStaleOrMissingDataverses(); - List staleOrMissingDatasets = indexService.findStaleOrMissingDatasets(); - JsonArrayBuilder jsonStateOrMissingDataverses = Json.createArrayBuilder(); - for (Dataverse dataverse : stateOrMissingDataverses) { - jsonStateOrMissingDataverses.add(dataverse.getId()); - } - JsonArrayBuilder datasetsInDatabaseButNotSolr = Json.createArrayBuilder(); - for (Dataset dataset : staleOrMissingDatasets) { - datasetsInDatabaseButNotSolr.add(dataset.getId()); - } - JsonObjectBuilder contentInDatabaseButStaleInOrMissingFromSolr = Json.createObjectBuilder() - /** - * @todo What about files? 
Currently files are always indexed - * along with their parent dataset - */ - .add("dataverses", jsonStateOrMissingDataverses.build().size()) - .add("datasets", datasetsInDatabaseButNotSolr.build().size()); - return contentInDatabaseButStaleInOrMissingFromSolr; - } - - private JsonObjectBuilder getContentInSolrButNotDatabase() throws SearchException { - List dataversesInSolrOnly = indexService.findDataversesInSolrOnly(); - List datasetsInSolrOnly = indexService.findDatasetsInSolrOnly(); - List filesInSolrOnly = indexService.findFilesInSolrOnly(); - JsonArrayBuilder dataversesInSolrButNotDatabase = Json.createArrayBuilder(); - for (Long dataverseId : dataversesInSolrOnly) { - dataversesInSolrButNotDatabase.add(dataverseId); - } - JsonArrayBuilder datasetsInSolrButNotDatabase = Json.createArrayBuilder(); - for (Long datasetId : datasetsInSolrOnly) { - datasetsInSolrButNotDatabase.add(datasetId); - } - JsonArrayBuilder filesInSolrButNotDatabase = Json.createArrayBuilder(); - for (Long fileId : filesInSolrOnly) { - filesInSolrButNotDatabase.add(fileId); - } - JsonObjectBuilder contentInSolrButNotDatabase = Json.createObjectBuilder() - /** - * @todo What about files? Currently files are always indexed - * along with their parent dataset - */ - .add("dataverses", dataversesInSolrButNotDatabase.build().size()) - .add("datasets", datasetsInSolrButNotDatabase.build().size()) - .add("files", filesInSolrButNotDatabase.build().size()); - return contentInSolrButNotDatabase; - } - - private JsonObjectBuilder getPermissionsInDatabaseButStaleInOrMissingFromSolr() { - List staleOrMissingPermissions; - staleOrMissingPermissions = solrIndexService.findPermissionsInDatabaseButStaleInOrMissingFromSolr(); - JsonArrayBuilder stalePermissionList = Json.createArrayBuilder(); - for (Long dvObjectId : staleOrMissingPermissions) { - stalePermissionList.add(dvObjectId); + public Response indexStatus(@QueryParam("sync") String sync) { + Future result = indexBatchService.indexStatus(); + if (sync != null) { + try { + JsonObjectBuilder status = result.get(); + return ok(status); + } catch (InterruptedException | ExecutionException e) { + return AbstractApiBean.error(Status.INTERNAL_SERVER_ERROR, "indexStatus method interrupted: " + e.getLocalizedMessage()); + } + } else { + return ok("Index Status Batch Job initiated, check log for job status."); } - return Json.createObjectBuilder() - .add("dvobjects", stalePermissionList.build().size()); } - - private JsonObjectBuilder getPermissionsInSolrButNotDatabase() { - List staleOrMissingPermissions = solrIndexService.findPermissionsInSolrNoLongerInDatabase(); - JsonArrayBuilder stalePermissionList = Json.createArrayBuilder(); - for (Long dvObjectId : staleOrMissingPermissions) { - stalePermissionList.add(dvObjectId); + /** + * Deletes "orphan" Solr documents (that don't match anything in the database). + * @param sync - optional parameter, if set, then run the command + * synchronously. Else, return immediately, and report the results in server.log + * @return what documents, if anything, was deleted + */ + @GET + @Path("clear-orphans") + /** + * Checks whether there are inconsistencies between the Solr index and the + * database, and reports back the status by content type + * + * @param sync - optional parameter, if !=null, then run the command + * synchronously. 
Else, return immediately, and report the status in + * server.log + * @return + */ + public Response clearOrphans(@QueryParam("sync") String sync) { + Future result = indexBatchService.clearOrphans(); + if (sync != null) { + try { + JsonObjectBuilder status = result.get(); + return ok(status); + } catch (InterruptedException | ExecutionException e) { + return AbstractApiBean.error(Status.INTERNAL_SERVER_ERROR, "indexStatus method interrupted: " + e.getLocalizedMessage()); + } + } else { + return ok("Clear Orphans Batch Job initiated, check log for job status."); } - return Json.createObjectBuilder() - .add("dvobjects", stalePermissionList.build().size()); } - + /** * We use the output of this method to generate our Solr schema.xml * diff --git a/src/main/java/edu/harvard/iq/dataverse/api/datadeposit/MediaResourceManagerImpl.java b/src/main/java/edu/harvard/iq/dataverse/api/datadeposit/MediaResourceManagerImpl.java index 6dfe605774f..23730885aab 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/datadeposit/MediaResourceManagerImpl.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/datadeposit/MediaResourceManagerImpl.java @@ -335,7 +335,7 @@ DepositReceipt replaceOrAddFiles(String uri, Deposit deposit, AuthCredentials au throw new SwordError(UriRegistry.ERROR_BAD_REQUEST, "Unable to add file(s) to dataset: " + violation.getMessage() + " The invalid value was \"" + violation.getInvalidValue() + "\"."); } else { - ingestService.saveAndAddFilesToDataset(editVersion, dataFiles); + ingestService.saveAndAddFilesToDataset(editVersion, dataFiles, false); } } else { diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java index 9b60993b365..84195227b33 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java @@ -40,6 +40,8 @@ import javax.persistence.NoResultException; import javax.persistence.PersistenceContext; import javax.xml.stream.XMLInputFactory; +import net.handle.hdllib.HandleException; +import net.handle.hdllib.HandleResolver; /** @@ -366,27 +368,42 @@ private FieldDTO makeDTO(DatasetFieldType dataverseFieldType, FieldDTO value, St } private String getOtherIdFromDTO(DatasetVersionDTO datasetVersionDTO) { + List otherIds = new ArrayList<>(); for (Map.Entry entry : datasetVersionDTO.getMetadataBlocks().entrySet()) { String key = entry.getKey(); MetadataBlockDTO value = entry.getValue(); if ("citation".equals(key)) { for (FieldDTO fieldDTO : value.getFields()) { if (DatasetFieldConstant.otherId.equals(fieldDTO.getTypeName())) { - String otherId = ""; for (HashSet foo : fieldDTO.getMultipleCompound()) { for (FieldDTO next : foo) { if (DatasetFieldConstant.otherIdValue.equals(next.getTypeName())) { - otherId = next.getSinglePrimitive(); + otherIds.add(next.getSinglePrimitive()); } } - if (!otherId.isEmpty()){ - return otherId; - } } } } } } + if (!otherIds.isEmpty()) { + // We prefer doi or hdl identifiers like "doi:10.7910/DVN/1HE30F" + for (String otherId : otherIds) { + if (otherId.startsWith(GlobalId.DOI_PROTOCOL) || otherId.startsWith(GlobalId.HDL_PROTOCOL) || otherId.startsWith(GlobalId.DOI_RESOLVER_URL) || otherId.startsWith(GlobalId.HDL_RESOLVER_URL)) { + return otherId; + } + } + // But identifiers without hdl or doi like "10.6084/m9.figshare.12725075.v1" are also allowed + for (String otherId : otherIds) { + try { + HandleResolver hr = 
new HandleResolver(); + hr.resolveHandle(otherId); + return GlobalId.HDL_PROTOCOL + ":" + otherId; + } catch (HandleException e) { + logger.fine("Not a valid handle: " + e.toString()); + } + } + } return null; } diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/DataAccess.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/DataAccess.java index 0cf9883b240..0e2320401dd 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/DataAccess.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/DataAccess.java @@ -20,6 +20,7 @@ package edu.harvard.iq.dataverse.dataaccess; +import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DvObject; import java.io.IOException; import java.util.HashMap; @@ -147,7 +148,11 @@ public static StorageIO createNewStorageIO(T dvObject, S throw new IOException("getDataAccessObject: null or invalid datafile."); } - return createNewStorageIO(dvObject, storageTag, dvObject.getDataverseContext().getEffectiveStorageDriverId()); + if (dvObject instanceof Dataset) { + return createNewStorageIO(dvObject, storageTag, ((Dataset)dvObject).getEffectiveStorageDriverId()); + } + // it's a DataFile: + return createNewStorageIO(dvObject, storageTag, dvObject.getOwner().getEffectiveStorageDriverId()); } public static StorageIO createNewStorageIO(T dvObject, String storageTag, String storageDriverId) throws IOException { diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/DataConverter.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/DataConverter.java index a627df0dbb3..2c60b51a525 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/DataConverter.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/DataConverter.java @@ -26,6 +26,9 @@ import java.io.IOException; import java.util.logging.Logger; + +import org.apache.commons.io.IOUtils; + import java.util.List; import java.util.Map; import java.util.HashMap; @@ -171,7 +174,9 @@ public static File downloadFromStorageIO(StorageIO storageIO) { } else { try { storageIO.open(); - return downloadFromByteChannel(storageIO.getReadChannel(), storageIO.getSize()); + try (ReadableByteChannel tabFileChannel = storageIO.getReadChannel()) { + return downloadFromByteChannel(tabFileChannel, storageIO.getSize()); + } } catch (IOException ex) { logger.warning("caught IOException trying to store tabular file " + storageIO.getDataFile().getStorageIdentifier() + " as a temp file."); } @@ -184,12 +189,13 @@ private static File downloadFromByteChannel(ReadableByteChannel tabFileChannel, logger.fine("opening datafFileIO for the source tabular file..."); File tabFile = File.createTempFile("tempTabFile", ".tmp"); - FileChannel tempFileChannel = new FileOutputStream(tabFile).getChannel(); - tempFileChannel.transferFrom(tabFileChannel, 0, size); - return tabFile; + try (FileChannel tempFileChannel = new FileOutputStream(tabFile).getChannel();) { + tempFileChannel.transferFrom(tabFileChannel, 0, size); + return tabFile; + } } catch (IOException ioex) { logger.warning("caught IOException trying to store tabular file as a temp file."); - } + } return null; } @@ -237,8 +243,10 @@ private static File runFormatConversion (DataFile file, File tabFile, String for try { StorageIO storageIO = file.getStorageIO(); long size = storageIO.getAuxObjectSize("orig"); - File origFile = downloadFromByteChannel((ReadableByteChannel) storageIO.openAuxChannel("orig"), size); - resultInfo = dfs.directConvert(origFile, origFormat); + try (ReadableByteChannel origChannel = (ReadableByteChannel) 
storageIO.openAuxChannel("orig")) { + File origFile = downloadFromByteChannel(origChannel, size); + resultInfo = dfs.directConvert(origFile, origFormat); + } } catch (IOException ex) { ex.printStackTrace(); return null; diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/ImageThumbConverter.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/ImageThumbConverter.java index 1249e95494e..ec18f23a5a0 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/ImageThumbConverter.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/ImageThumbConverter.java @@ -235,6 +235,7 @@ private static boolean generatePDFThumbnail(StorageIO storageIO, int s return false; } finally { IOUtils.closeQuietly(tempFileChannel); + IOUtils.closeQuietly(pdfFileChannel); } sourcePdfFile = tempFile; } @@ -272,7 +273,9 @@ private static boolean generateImageThumbnail(StorageIO storageIO, int try { storageIO.open(); - return generateImageThumbnailFromInputStream(storageIO, size, storageIO.getInputStream()); + try(InputStream inputStream = storageIO.getInputStream()) { + return generateImageThumbnailFromInputStream(storageIO, size, inputStream); + } } catch (IOException ioex) { logger.warning("caught IOException trying to open an input stream for " + storageIO.getDataFile().getStorageIdentifier() + ioex); return false; @@ -312,6 +315,7 @@ private static boolean generateWorldMapThumbnail(StorageIO storageIO, worldMapImageInputStream.close(); return false; } + return generateImageThumbnailFromInputStream(storageIO, size, worldMapImageInputStream); } catch (FileNotFoundException fnfe) { logger.fine("No .img file for this worldmap file yet; giving up. Original Error: " + fnfe); return false; @@ -319,9 +323,10 @@ private static boolean generateWorldMapThumbnail(StorageIO storageIO, } catch (IOException ioex) { logger.warning("caught IOException trying to open an input stream for worldmap .img file (" + storageIO.getDataFile().getStorageIdentifier() + "). 
Original Error: " + ioex); return false; + } finally { + IOUtils.closeQuietly(worldMapImageInputStream); } - - return generateImageThumbnailFromInputStream(storageIO, size, worldMapImageInputStream); + } /* @@ -750,15 +755,14 @@ private static void rescaleImage(BufferedImage fullSizeImage, int width, int hei g2.drawImage(thumbImage, 0, 0, null); g2.dispose(); - try { - ImageOutputStream ios = ImageIO.createImageOutputStream(outputStream); + try (ImageOutputStream ios = ImageIO.createImageOutputStream(outputStream);) { + writer.setOutput(ios); // finally, save thumbnail image: writer.write(lowRes); writer.dispose(); - ios.close(); thumbImage.flush(); //fullSizeImage.flush(); lowRes.flush(); diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java index 3e38d3cdc9c..0c4558edb30 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java @@ -1,6 +1,7 @@ package edu.harvard.iq.dataverse.dataaccess; import com.amazonaws.AmazonClientException; +import com.amazonaws.ClientConfiguration; import com.amazonaws.HttpMethod; import com.amazonaws.SdkClientException; import com.amazonaws.auth.AWSStaticCredentialsProvider; @@ -90,7 +91,14 @@ public S3AccessIO(T dvObject, DataAccessRequest req, String driverId) { minPartSize = getMinPartSize(driverId); s3=getClient(driverId); tm=getTransferManager(driverId); - + //Not sure this is needed but moving it from the open method for now since it definitely doesn't need to run every time an object is opened. + try { + if (bucketName == null || !s3.doesBucketExistV2(bucketName)) { + throw new IOException("ERROR: S3AccessIO - You must create and configure a bucket before creating datasets."); + } + } catch (SdkClientException sce) { + throw new IOException("ERROR: S3AccessIO - Failed to look up bucket "+bucketName+" (is AWS properly configured?): " + sce.getMessage()); + } } catch (Exception e) { throw new AmazonClientException( "Cannot instantiate a S3 client; check your AWS credentials and region", @@ -128,14 +136,6 @@ public void open(DataAccessOption... options) throws IOException { throw new IOException("ERROR: s3 not initialised. 
"); } - try { - if (bucketName == null || !s3.doesBucketExist(bucketName)) { - throw new IOException("ERROR: S3AccessIO - You must create and configure a bucket before creating datasets."); - } - } catch (SdkClientException sce) { - throw new IOException("ERROR: S3AccessIO - Failed to look up bucket "+bucketName+" (is AWS properly configured?): " + sce.getMessage()); - } - DataAccessRequest req = this.getRequest(); if (isWriteAccessRequested(options)) { @@ -582,18 +582,20 @@ public void saveInputStreamAsAux(InputStream inputStream, String auxItemTag) thr //Helper method for supporting saving streams with unknown length to S3 //We save those streams to a file and then upload the file - private File createTempFile(Path path, InputStream inputStream) throws IOException { + private File createTempFile(Path path, InputStream inputStream) throws IOException { + + File targetFile = new File(path.toUri()); // File needs a name + try (OutputStream outStream = new FileOutputStream(targetFile);) { - File targetFile = new File(path.toUri()); //File needs a name - OutputStream outStream = new FileOutputStream(targetFile); + byte[] buffer = new byte[8 * 1024]; + int bytesRead; + while ((bytesRead = inputStream.read(buffer)) != -1) { + outStream.write(buffer, 0, bytesRead); + } - byte[] buffer = new byte[8 * 1024]; - int bytesRead; - while ((bytesRead = inputStream.read(buffer)) != -1) { - outStream.write(buffer, 0, bytesRead); + } finally { + IOUtils.closeQuietly(inputStream); } - IOUtils.closeQuietly(inputStream); - IOUtils.closeQuietly(outStream); return targetFile; } @@ -1087,6 +1089,11 @@ private static AmazonS3 getClient(String driverId) { // get a standard client, using the standard way of configuration the credentials, etc. AmazonS3ClientBuilder s3CB = AmazonS3ClientBuilder.standard(); + ClientConfiguration cc = new ClientConfiguration(); + Integer poolSize = Integer.getInteger("dataverse.files." + driverId + ".connection-pool-size", 256); + cc.setMaxConnections(poolSize); + s3CB.setClientConfiguration(cc); + /** * Pass in a URL pointing to your S3 compatible storage. * For possible values see https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/client/builder/AwsClientBuilder.EndpointConfiguration.html diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/StoredOriginalFile.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/StoredOriginalFile.java index 5be7882bc5e..0955d5f5565 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/StoredOriginalFile.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/StoredOriginalFile.java @@ -26,6 +26,8 @@ import java.nio.channels.Channels; import java.nio.channels.ReadableByteChannel; import java.util.logging.Logger; + +import org.apache.tika.io.IOUtils; /** * * @author Leonid Andreev @@ -56,17 +58,18 @@ public static StorageIO retreive(StorageIO storageIO) { long storedOriginalSize; InputStreamIO inputStreamIO; - + Channel storedOriginalChannel = null; try { storageIO.open(); - Channel storedOriginalChannel = storageIO.openAuxChannel(SAVED_ORIGINAL_FILENAME_EXTENSION); + storedOriginalChannel = storageIO.openAuxChannel(SAVED_ORIGINAL_FILENAME_EXTENSION); storedOriginalSize = dataFile.getDataTable().getOriginalFileSize() != null ? 
dataFile.getDataTable().getOriginalFileSize() : storageIO.getAuxObjectSize(SAVED_ORIGINAL_FILENAME_EXTENSION); inputStreamIO = new InputStreamIO(Channels.newInputStream((ReadableByteChannel) storedOriginalChannel), storedOriginalSize); logger.fine("Opened stored original file as Aux "+SAVED_ORIGINAL_FILENAME_EXTENSION); } catch (IOException ioEx) { - // The original file not saved, or could not be opened. + IOUtils.closeQuietly(storedOriginalChannel); + // The original file not saved, or could not be opened. logger.fine("Failed to open stored original file as Aux "+SAVED_ORIGINAL_FILENAME_EXTENSION+"!"); return null; } diff --git a/src/main/java/edu/harvard/iq/dataverse/dataset/DatasetUtil.java b/src/main/java/edu/harvard/iq/dataverse/dataset/DatasetUtil.java index 9c61079e3f6..69e35b8e8c8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataset/DatasetUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataset/DatasetUtil.java @@ -273,7 +273,7 @@ public static Dataset persistDatasetLogoToStorageAndCreateThumbnails(Dataset dat try { tmpFile = FileUtil.inputStreamToFile(inputStream); } catch (IOException ex) { - logger.severe(ex.getMessage()); + logger.severe(ex.getMessage()); } StorageIO dataAccess = null; @@ -298,11 +298,13 @@ public static Dataset persistDatasetLogoToStorageAndCreateThumbnails(Dataset dat try { fullSizeImage = ImageIO.read(tmpFile); } catch (IOException ex) { + IOUtils.closeQuietly(inputStream); logger.severe(ex.getMessage()); return null; } if (fullSizeImage == null) { logger.fine("fullSizeImage was null!"); + IOUtils.closeQuietly(inputStream); return null; } int width = fullSizeImage.getWidth(); @@ -311,6 +313,7 @@ public static Dataset persistDatasetLogoToStorageAndCreateThumbnails(Dataset dat try { src = new FileInputStream(tmpFile).getChannel(); } catch (FileNotFoundException ex) { + IOUtils.closeQuietly(inputStream); logger.severe(ex.getMessage()); return null; } @@ -318,6 +321,7 @@ public static Dataset persistDatasetLogoToStorageAndCreateThumbnails(Dataset dat try { dest = new FileOutputStream(tmpFile).getChannel(); } catch (FileNotFoundException ex) { + IOUtils.closeQuietly(inputStream); logger.severe(ex.getMessage()); return null; } @@ -329,10 +333,13 @@ public static Dataset persistDatasetLogoToStorageAndCreateThumbnails(Dataset dat } File tmpFileForResize = null; try { + //The stream was used around line 274 above, so this creates an empty file (OK since all it is used for is getting a path, but not reusing it here would make it easier to close it above.) 
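
The StoredOriginalFile and DatasetUtil changes above apply one common pattern: any stream or channel that gets opened must be released on every exit path, including early returns and exception branches. A minimal standalone sketch of that pattern, using a hypothetical readHeader helper (commons-io IOUtils, which the patch already imports elsewhere):

```java
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.IOUtils;

public class StreamCleanupSketch {
    /**
     * Read up to len bytes from a stream, guaranteeing the stream is closed
     * on every exit path: normal return, early return, or exception.
     */
    public static byte[] readHeader(InputStream in, int len) {
        try {
            byte[] header = new byte[len];
            int read = in.read(header);
            if (read < len) {
                return null; // early return; the finally block still closes the stream
            }
            return header;
        } catch (IOException ioe) {
            return null;
        } finally {
            IOUtils.closeQuietly(in);
        }
    }
}
```
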
tmpFileForResize = FileUtil.inputStreamToFile(inputStream); } catch (IOException ex) { logger.severe(ex.getMessage()); return null; + } finally { + IOUtils.closeQuietly(inputStream); } // We'll try to pre-generate the rescaled versions in both the // DEFAULT_DATASET_LOGO (currently 140) and DEFAULT_CARDIMAGE_SIZE (48) @@ -447,7 +454,7 @@ public static boolean isAppropriateStorageDriver(Dataset dataset){ // instead of testing for the 's3" store, //This method is used by both the dataset and edit files page so one change here //will fix both - return dataset.getDataverseContext().getEffectiveStorageDriverId().equals("s3"); + return dataset.getEffectiveStorageDriverId().equals("s3"); } /** diff --git a/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java b/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java index 4928100dfff..ab34b5b2675 100644 --- a/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java +++ b/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java @@ -1501,7 +1501,7 @@ private boolean step_060_addFilesViaIngestService(){ } int nFiles = finalFileList.size(); - finalFileList = ingestService.saveAndAddFilesToDataset(workingVersion, finalFileList); + finalFileList = ingestService.saveAndAddFilesToDataset(workingVersion, finalFileList, isFileReplaceOperation()); if (nFiles != finalFileList.size()) { if (nFiles == 1) { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractCreateDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractCreateDatasetCommand.java index 13652b93f75..2046e4b107e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractCreateDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractCreateDatasetCommand.java @@ -96,7 +96,7 @@ public Dataset execute(CommandContext ctxt) throws CommandException { theDataset.setAuthority(ctxt.settings().getValueForKey(SettingsServiceBean.Key.Authority, nonNullDefaultIfKeyNotFound)); } if (theDataset.getStorageIdentifier() == null) { - String driverId = theDataset.getDataverseContext().getEffectiveStorageDriverId(); + String driverId = theDataset.getEffectiveStorageDriverId(); theDataset.setStorageIdentifier(driverId + "://" + theDataset.getAuthorityForFileStorage() + "/" + theDataset.getIdentifierForFileStorage()); } if (theDataset.getIdentifier()==null) { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java index 66e8770a641..468e99f24c1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java @@ -99,7 +99,12 @@ public void run() { } } }).start(); - + //Have seen Pipe Closed errors for other archivers when used as a workflow without this delay loop + int i=0; + while(digestInputStream.available()<=0 && i<100) { + Thread.sleep(10); + i++; + } String checksum = store.addContent(spaceName, "datacite.xml", digestInputStream, -1l, null, null, null); logger.fine("Content: datacite.xml added with checksum: " + checksum); @@ -133,7 +138,11 @@ public void run() { } } }).start(); - + i=0; + while(digestInputStream.available()<=0 && i<100) { + Thread.sleep(10); + i++; + } checksum = store.addContent(spaceName, 
fileName, digestInputStream2, -1l, null, null, null); logger.fine("Content: " + fileName + " added with checksum: " + checksum); @@ -174,6 +183,9 @@ public void run() { logger.severe(rte.getMessage()); return new Failure("Error in generating datacite.xml file", "DuraCloud Submission Failure: metadata file not created"); + } catch (InterruptedException e) { + logger.warning(e.getLocalizedMessage()); + e.printStackTrace(); } } catch (ContentStoreException e) { logger.warning(e.getMessage()); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java index 92bd22eb902..ce407d8986b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java @@ -66,6 +66,8 @@ public FinalizeDatasetPublicationCommand(Dataset aDataset, DataverseRequest aReq public Dataset execute(CommandContext ctxt) throws CommandException { Dataset theDataset = getDataset(); + logger.info("Finalizing publication of the dataset "+theDataset.getGlobalId().asString()); + // validate the physical files before we do anything else: // (unless specifically disabled; or a minor version) if (theDataset.getLatestVersion().getVersionState() != RELEASED @@ -91,6 +93,7 @@ public Dataset execute(CommandContext ctxt) throws CommandException { registerExternalIdentifier(theDataset, ctxt, false); } catch (CommandException comEx) { + logger.warning("Failed to reserve the identifier "+theDataset.getGlobalId().asString()+"; notifying the user(s), unlocking the dataset"); // Send failure notification to the user: notifyUsersDatasetPublishStatus(ctxt, theDataset, UserNotification.Type.PUBLISHFAILED_PIDREG); // Remove the dataset lock: @@ -197,6 +200,9 @@ public Dataset execute(CommandContext ctxt) throws CommandException { ctxt.datasets().removeDatasetLocks(theDataset, DatasetLock.Reason.InReview); } + logger.info("Successfully published the dataset "+theDataset.getGlobalId().asString()); + + return readyDataset; } @@ -355,6 +361,8 @@ private void publicizeExternalIdentifier(Dataset dataset, CommandContext ctxt) t dataset.setGlobalIdCreateTime(new Date()); // TODO these two methods should be in the responsibility of the idServiceBean. 
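
The delay loops added to DuraCloudSubmitToArchiveCommand above (and to GoogleCloudSubmitToArchiveCommand further below) guard against handing the content store a piped stream before the writer thread has produced any bytes, which is where the "Pipe closed" failures came from. A self-contained sketch of that producer/consumer arrangement; the upload step is replaced by a simple drain here, since no ContentStore client is assumed:

```java
import java.io.PipedInputStream;
import java.io.PipedOutputStream;
import java.nio.charset.StandardCharsets;
import java.security.DigestInputStream;
import java.security.MessageDigest;

public class PipedUploadSketch {
    public static void main(String[] args) throws Exception {
        String payload = "<resource>...</resource>"; // stands in for datacite.xml
        MessageDigest md5 = MessageDigest.getInstance("MD5");
        try (PipedInputStream in = new PipedInputStream();
             DigestInputStream digestIn = new DigestInputStream(in, md5)) {
            // writer thread feeds the pipe, as the archiver commands do
            new Thread(() -> {
                try (PipedOutputStream out = new PipedOutputStream(in)) {
                    out.write(payload.getBytes(StandardCharsets.UTF_8));
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }).start();
            // bounded wait until the pipe has bytes before handing the stream
            // to an upload client, to avoid a race with the writer thread
            int i = 0;
            while (digestIn.available() <= 0 && i < 100) {
                Thread.sleep(10);
                i++;
            }
            // an upload client would consume digestIn here; we just drain it
            while (digestIn.read() != -1) { /* consume */ }
            System.out.println("MD5 digest length: " + md5.digest().length + " bytes");
        }
    }
}
```
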
dataset.setIdentifierRegistered(true); } catch (Throwable e) { + logger.warning("Failed to register the identifier "+dataset.getGlobalId().asString()+", or to register a file in the dataset; notifying the user(s), unlocking the dataset"); + // Send failure notification to the user: notifyUsersDatasetPublishStatus(ctxt, dataset, UserNotification.Type.PUBLISHFAILED_PIDREG); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java new file mode 100644 index 00000000000..cb729a9807a --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -0,0 +1,228 @@ +package edu.harvard.iq.dataverse.engine.command.impl; + +import edu.harvard.iq.dataverse.DOIDataCiteRegisterService; +import edu.harvard.iq.dataverse.DataCitation; +import edu.harvard.iq.dataverse.Dataset; +import edu.harvard.iq.dataverse.DatasetVersion; +import edu.harvard.iq.dataverse.DatasetLock.Reason; +import edu.harvard.iq.dataverse.authorization.Permission; +import edu.harvard.iq.dataverse.authorization.users.ApiToken; +import edu.harvard.iq.dataverse.engine.command.Command; +import edu.harvard.iq.dataverse.engine.command.DataverseRequest; +import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator; +import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.workflow.step.Failure; +import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; + +import java.io.BufferedInputStream; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.PipedInputStream; +import java.io.PipedOutputStream; +import java.nio.charset.Charset; +import java.security.DigestInputStream; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.Map; +import java.util.logging.Logger; + +import org.apache.commons.codec.binary.Hex; +import com.google.auth.oauth2.ServiceAccountCredentials; +import com.google.cloud.storage.Blob; +import com.google.cloud.storage.Bucket; +import com.google.cloud.storage.Storage; +import com.google.cloud.storage.StorageOptions; + +@RequiredPermissions(Permission.PublishDataset) +public class GoogleCloudSubmitToArchiveCommand extends AbstractSubmitToArchiveCommand implements Command { + + private static final Logger logger = Logger.getLogger(GoogleCloudSubmitToArchiveCommand.class.getName()); + private static final String GOOGLECLOUD_BUCKET = ":GoogleCloudBucket"; + private static final String GOOGLECLOUD_PROJECT = ":GoogleCloudProject"; + + public GoogleCloudSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion version) { + super(aRequest, version); + } + + @Override + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map requestedSettings) { + logger.fine("In GoogleCloudSubmitToArchiveCommand..."); + String bucketName = requestedSettings.get(GOOGLECLOUD_BUCKET); + String projectName = requestedSettings.get(GOOGLECLOUD_PROJECT); + logger.fine("Project: " + projectName + " Bucket: " + bucketName); + if (bucketName != null && projectName != null) { + Storage storage; + try { + FileInputStream fis = new FileInputStream(System.getProperty("dataverse.files.directory") + System.getProperty("file.separator")+ "googlecloudkey.json"); + storage = StorageOptions.newBuilder() + 
.setCredentials(ServiceAccountCredentials.fromStream(fis)) + .setProjectId(projectName) + .build() + .getService(); + Bucket bucket = storage.get(bucketName); + + Dataset dataset = dv.getDataset(); + if (dataset.getLockFor(Reason.finalizePublication) == null) { + + String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') + .replace('.', '-').toLowerCase(); + + DataCitation dc = new DataCitation(dv); + Map metadata = dc.getDataCiteMetadata(); + String dataciteXml = DOIDataCiteRegisterService.getMetadataFromDvObject( + dv.getDataset().getGlobalId().asString(), metadata, dv.getDataset()); + String blobIdString = null; + MessageDigest messageDigest = MessageDigest.getInstance("MD5"); + try (PipedInputStream dataciteIn = new PipedInputStream(); DigestInputStream digestInputStream = new DigestInputStream(dataciteIn, messageDigest)) { + // Add datacite.xml file + + new Thread(new Runnable() { + public void run() { + try (PipedOutputStream dataciteOut = new PipedOutputStream(dataciteIn)) { + + dataciteOut.write(dataciteXml.getBytes(Charset.forName("utf-8"))); + dataciteOut.close(); + } catch (Exception e) { + logger.severe("Error creating datacite.xml: " + e.getMessage()); + // TODO Auto-generated catch block + e.printStackTrace(); + throw new RuntimeException("Error creating datacite.xml: " + e.getMessage()); + } + } + }).start(); + //Have seen broken pipe in PostPublishDataset workflow without this delay + int i=0; + while(digestInputStream.available()<=0 && i<100) { + Thread.sleep(10); + i++; + } + Blob dcXml = bucket.create(spaceName + "/datacite.v" + dv.getFriendlyVersionNumber()+".xml", digestInputStream, "text/xml", Bucket.BlobWriteOption.doesNotExist()); + String checksum = dcXml.getMd5ToHexString(); + logger.fine("Content: datacite.xml added with checksum: " + checksum); + String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest()); + if (!checksum.equals(localchecksum)) { + logger.severe(checksum + " not equal to " + localchecksum); + return new Failure("Error in transferring DataCite.xml file to GoogleCloud", + "GoogleCloud Submission Failure: incomplete metadata transfer"); + } + + // Store BagIt file + String fileName = spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip"; + + // Add BagIt ZIP file + // Google uses MD5 as one way to verify the + // transfer + messageDigest = MessageDigest.getInstance("MD5"); + try (PipedInputStream in = new PipedInputStream(100000); DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest);) { + Thread writeThread = new Thread(new Runnable() { + public void run() { + try (PipedOutputStream out = new PipedOutputStream(in)) { + // Generate bag + BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); + bagger.setAuthenticationKey(token.getTokenString()); + bagger.generateBag(out); + } catch (Exception e) { + logger.severe("Error creating bag: " + e.getMessage()); + // TODO Auto-generated catch block + e.printStackTrace(); + try { + digestInputStream2.close(); + } catch(Exception ex) { + logger.warning(ex.getLocalizedMessage()); + } + throw new RuntimeException("Error creating bag: " + e.getMessage()); + } + } + }); + writeThread.start(); + /* + * The following loop handles two issues. First, with no delay, the + * bucket.create() call below can get started before the piped streams are set + * up, causing a failure (seen when triggered in a PostPublishDataset workflow). + * A minimal initial wait, e.g. 
until some bytes are available, would address + * this. Second, the BagGenerator class, due to it's use of parallel streaming + * creation of the zip file, has the characteristic that it makes a few bytes + * available - from setting up the directory structure for the zip file - + * significantly earlier than it is ready to stream file content (e.g. for + * thousands of files and GB of content). If, for these large datasets, + * bucket.create() is called as soon as bytes are available, the call can + * timeout before the bytes for all the zipped files are available. To manage + * this, the loop waits until 90K bytes are available, larger than any expected + * dir structure for the zip and implying that the main zipped content is + * available, or until the thread terminates, with all of its content written to + * the pipe. (Note the PipedInputStream buffer is set at 100K above - I didn't + * want to test whether that means that exactly 100K bytes will be available() + * for large datasets or not, so the test below is at 90K.) + * + * An additional sanity check limits the wait to 2K seconds. The BagGenerator + * has been used to archive >120K files, 2K directories, and ~600GB files on the + * SEAD project (streaming content to disk rather than over an internet + * connection) which would take longer than 2K seconds (10+ hours) and might + * produce an initial set of bytes for directories > 90K. If Dataverse ever + * needs to support datasets of this size, the numbers here would need to be + * increased, and/or a change in how archives are sent to google (e.g. as + * multiple blobs that get aggregated) would be required. + */ + i=0; + while(digestInputStream2.available()<=90000 && i<2000 && writeThread.isAlive()) { + Thread.sleep(1000); + logger.fine("avail: " + digestInputStream2.available() + " : " + writeThread.getState().toString()); + i++; + } + logger.fine("Bag: transfer started, i=" + i + ", avail = " + digestInputStream2.available()); + if(i==2000) { + throw new IOException("Stream not available"); + } + Blob bag = bucket.create(spaceName + "/" + fileName, digestInputStream2, "application/zip", Bucket.BlobWriteOption.doesNotExist()); + if(bag.getSize()==0) { + throw new IOException("Empty Bag"); + } + blobIdString = bag.getBlobId().getBucket() + "/" + bag.getBlobId().getName(); + checksum = bag.getMd5ToHexString(); + logger.fine("Bag: " + fileName + " added with checksum: " + checksum); + localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest()); + if (!checksum.equals(localchecksum)) { + logger.severe(checksum + " not equal to " + localchecksum); + return new Failure("Error in transferring Zip file to GoogleCloud", + "GoogleCloud Submission Failure: incomplete archive transfer"); + } + } catch (RuntimeException rte) { + logger.severe("Error creating Bag during GoogleCloud archiving: " + rte.getMessage()); + return new Failure("Error in generating Bag", + "GoogleCloud Submission Failure: archive file not created"); + } + + logger.fine("GoogleCloud Submission step: Content Transferred"); + + // Document the location of dataset archival copy location (actually the URL + // where you can + // view it as an admin) + + StringBuffer sb = new StringBuffer("https://console.cloud.google.com/storage/browser/"); + sb.append(blobIdString); + dv.setArchivalCopyLocation(sb.toString()); + } catch (RuntimeException rte) { + logger.severe("Error creating datacite xml file during GoogleCloud Archiving: " + rte.getMessage()); + return new Failure("Error in generating 
datacite.xml file", + "GoogleCloud Submission Failure: metadata file not created"); + } + } else { + logger.warning("GoogleCloud Submision Workflow aborted: Dataset locked for pidRegister"); + return new Failure("Dataset locked"); + } + } catch (Exception e) { + logger.warning(e.getLocalizedMessage()); + e.printStackTrace(); + return new Failure("GoogleCloud Submission Failure", + e.getLocalizedMessage() + ": check log for details"); + + } + return WorkflowStepResult.OK; + } else { + return new Failure("GoogleCloud Submission not configured - no \":GoogleCloudBucket\" and/or \":GoogleCloudProject\"."); + } + } + +} diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/RedetectFileTypeCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/RedetectFileTypeCommand.java index 0477a483783..ef20ec76e12 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/RedetectFileTypeCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/RedetectFileTypeCommand.java @@ -53,11 +53,11 @@ public DataFile execute(CommandContext ctxt) throws CommandException { } else { // Need to create a temporary local file: - ReadableByteChannel targetFileChannel = (ReadableByteChannel) storageIO.getReadChannel(); tempFile = File.createTempFile("tempFileTypeCheck", ".tmp"); - FileChannel tempFileChannel = new FileOutputStream(tempFile).getChannel(); - tempFileChannel.transferFrom(targetFileChannel, 0, storageIO.getSize()); - + try (ReadableByteChannel targetFileChannel = (ReadableByteChannel) storageIO.getReadChannel(); + FileChannel tempFileChannel = new FileOutputStream(tempFile).getChannel();) { + tempFileChannel.transferFrom(targetFileChannel, 0, storageIO.getSize()); + } localFile = tempFile; } diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java index 6743c19875a..f5eeaa1c316 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java @@ -67,6 +67,8 @@ import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.por.PORFileReader; import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.por.PORFileReaderSpi; import edu.harvard.iq.dataverse.util.*; + +import org.apache.commons.io.IOUtils; //import edu.harvard.iq.dvn.unf.*; import org.dataverse.unf.*; import java.io.BufferedInputStream; @@ -153,7 +155,7 @@ public class IngestServiceBean { // DataFileCategory objects, if any were already assigned to the files). // It must be called before we attempt to permanently save the files in // the database by calling the Save command on the dataset and/or version. 
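
RedetectFileTypeCommand above, like DataConverter earlier in the patch, now wraps the channel-to-temp-file copy in try-with-resources so that both channels are closed even when transferFrom() fails. The same idiom in isolation, with hypothetical names:

```java
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.channels.ReadableByteChannel;

public class ChannelCopySketch {
    /**
     * Copy a source stream into a temp file via NIO channels; both channels
     * are closed automatically, even if transferFrom() throws.
     */
    public static File copyToTempFile(InputStream source, long size) throws IOException {
        File tempFile = File.createTempFile("tempFileTypeCheck", ".tmp");
        try (ReadableByteChannel in = Channels.newChannel(source);
             FileChannel out = new FileOutputStream(tempFile).getChannel()) {
            out.transferFrom(in, 0, size);
        }
        return tempFile;
    }
}
```
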
- public List saveAndAddFilesToDataset(DatasetVersion version, List newFiles) { + public List saveAndAddFilesToDataset(DatasetVersion version, List newFiles, boolean isReplaceOperation) { List ret = new ArrayList<>(); if (newFiles != null && newFiles.size() > 0) { @@ -162,9 +164,10 @@ public List saveAndAddFilesToDataset(DatasetVersion version, List" + new String(Hex.encodeHex(hdr4)) + "<-"); diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java index 3746bb1d92c..5171b1a864a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java @@ -1,14 +1,15 @@ package edu.harvard.iq.dataverse.search; -import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetServiceBean; import edu.harvard.iq.dataverse.Dataverse; import edu.harvard.iq.dataverse.DataverseServiceBean; import edu.harvard.iq.dataverse.DvObjectServiceBean; import edu.harvard.iq.dataverse.util.SystemConfig; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import java.util.concurrent.Future; +import java.util.logging.Level; import java.util.logging.Logger; import javax.ejb.AsyncResult; import javax.ejb.Asynchronous; @@ -44,11 +45,71 @@ public class IndexBatchServiceBean { DvObjectServiceBean dvObjectService; @EJB SystemConfig systemConfig; + + @Asynchronous + public Future indexStatus() { + JsonObjectBuilder response = Json.createObjectBuilder(); + logger.info("Beginning indexStatus()"); + JsonObject contentInDatabaseButStaleInOrMissingFromSolr = getContentInDatabaseButStaleInOrMissingFromSolr().build(); + JsonObject contentInSolrButNotDatabase = null; + JsonObject permissionsInSolrButNotDatabase = null; + try { + contentInSolrButNotDatabase = getContentInSolrButNotDatabase().build(); + permissionsInSolrButNotDatabase = getPermissionsInSolrButNotDatabase().build(); + + } catch (SearchException ex) { + String msg = "Can not determine index status. " + ex.getLocalizedMessage() + ". Is Solr down? 
Exception: " + ex.getCause().getLocalizedMessage(); + logger.info(msg); + response.add("SearchException ", msg); + return new AsyncResult<>(response); + } + + JsonObject permissionsInDatabaseButStaleInOrMissingFromSolr = getPermissionsInDatabaseButStaleInOrMissingFromSolr().build(); + + JsonObjectBuilder data = Json.createObjectBuilder() + .add("contentInDatabaseButStaleInOrMissingFromIndex", contentInDatabaseButStaleInOrMissingFromSolr) + .add("contentInIndexButNotDatabase", contentInSolrButNotDatabase) + .add("permissionsInDatabaseButStaleInOrMissingFromIndex", permissionsInDatabaseButStaleInOrMissingFromSolr) + .add("permissionsInIndexButNotDatabase", permissionsInSolrButNotDatabase); + + logger.log(Level.INFO, "contentInDatabaseButStaleInOrMissingFromIndex: {0}", contentInDatabaseButStaleInOrMissingFromSolr); + logger.log(Level.INFO, "contentInIndexButNotDatabase: {0}", contentInSolrButNotDatabase); + logger.log(Level.INFO, "permissionsInDatabaseButStaleInOrMissingFromIndex: {0}", permissionsInDatabaseButStaleInOrMissingFromSolr); + logger.log(Level.INFO, "permissionsInIndexButNotDatabase: {0}", permissionsInSolrButNotDatabase); + + return new AsyncResult<>(data); + } + @Asynchronous + public Future clearOrphans() { + JsonObjectBuilder response = Json.createObjectBuilder(); + List solrIds = new ArrayList<>(); + logger.info("Beginning clearOrphans() to check for orphan Solr documents."); + try { + logger.info("checking for orphans type dataverse"); + solrIds.addAll(indexService.findDataversesInSolrOnly()); + logger.info("checking for orphans type dataset"); + solrIds.addAll(indexService.findDatasetsInSolrOnly()); + logger.info("checking for orphans file"); + solrIds.addAll(indexService.findFilesInSolrOnly()); + logger.info("checking for orphan permissions"); + solrIds.addAll(indexService.findPermissionsInSolrOnly()); + } catch (SearchException e) { + logger.info("SearchException in clearOrphans: " + e.getMessage()); + response.add("response from clearOrphans","SearchException: " + e.getMessage() ); + } + logger.info("found " + solrIds.size()+ " orphan documents"); + IndexResponse resultOfSolrDeletionAttempt = solrIndexService.deleteMultipleSolrIds(solrIds); + logger.info(resultOfSolrDeletionAttempt.getMessage()); + response.add("resultOfSolrDeletionAttempt", resultOfSolrDeletionAttempt.getMessage()); + + return new AsyncResult<>(response); + } + @Asynchronous public Future indexAllOrSubset(long numPartitions, long partitionId, boolean skipIndexed, boolean previewOnly) { JsonObjectBuilder response = Json.createObjectBuilder(); - Future responseFromIndexAllOrSubset = indexAllOrSubset(numPartitions, partitionId, skipIndexed); + indexAllOrSubset(numPartitions, partitionId, skipIndexed); String status = "indexAllOrSubset has begun"; response.add("responseFromIndexAllOrSubset", status); return new AsyncResult<>(response); @@ -221,5 +282,85 @@ public void indexDataverseRecursively(Dataverse dataverse) { } logger.info(dataverseIndexCount + " dataverses and " + datasetIndexCount + " datasets indexed. 
Total time to index " + (end - start) + "."); } + private JsonObjectBuilder getContentInDatabaseButStaleInOrMissingFromSolr() { + logger.info("checking for stale or missing dataverses"); + List stateOrMissingDataverses = indexService.findStaleOrMissingDataverses(); + logger.info("checking for stale or missing datasets"); + List staleOrMissingDatasets = indexService.findStaleOrMissingDatasets(); + JsonArrayBuilder jsonStaleOrMissingDataverses = Json.createArrayBuilder(); + for (Long id : stateOrMissingDataverses) { + jsonStaleOrMissingDataverses.add(id); + } + JsonArrayBuilder datasetsInDatabaseButNotSolr = Json.createArrayBuilder(); + for (Long id : staleOrMissingDatasets) { + datasetsInDatabaseButNotSolr.add(id); + } + JsonObjectBuilder contentInDatabaseButStaleInOrMissingFromSolr = Json.createObjectBuilder() + /** + * @todo What about files? Currently files are always indexed + * along with their parent dataset + */ + .add("dataverses", jsonStaleOrMissingDataverses.build()) + .add("datasets", datasetsInDatabaseButNotSolr.build()); + logger.info("completed check for stale or missing content."); + return contentInDatabaseButStaleInOrMissingFromSolr; + } + + private JsonObjectBuilder getContentInSolrButNotDatabase() throws SearchException { + logger.info("checking for dataverses in Solr only"); + List dataversesInSolrOnly = indexService.findDataversesInSolrOnly(); + logger.info("checking for datasets in Solr only"); + List datasetsInSolrOnly = indexService.findDatasetsInSolrOnly(); + logger.info("checking for files in Solr only"); + List filesInSolrOnly = indexService.findFilesInSolrOnly(); + JsonArrayBuilder dataversesInSolrButNotDatabase = Json.createArrayBuilder(); + logger.info("completed check for content in Solr but not database"); + for (String dataverseId : dataversesInSolrOnly) { + dataversesInSolrButNotDatabase.add(dataverseId); + } + JsonArrayBuilder datasetsInSolrButNotDatabase = Json.createArrayBuilder(); + for (String datasetId : datasetsInSolrOnly) { + datasetsInSolrButNotDatabase.add(datasetId); + } + JsonArrayBuilder filesInSolrButNotDatabase = Json.createArrayBuilder(); + for (String fileId : filesInSolrOnly) { + filesInSolrButNotDatabase.add(fileId); + } + JsonObjectBuilder contentInSolrButNotDatabase = Json.createObjectBuilder() + /** + * @todo What about files? 
Currently files are always indexed + * along with their parent dataset + */ + .add("dataverses", dataversesInSolrButNotDatabase.build()) + .add("datasets", datasetsInSolrButNotDatabase.build()) + .add("files", filesInSolrButNotDatabase.build()); + + return contentInSolrButNotDatabase; + } + + private JsonObjectBuilder getPermissionsInDatabaseButStaleInOrMissingFromSolr() { + List staleOrMissingPermissions; + logger.info("checking for permissions in database but stale or missing from Solr"); + staleOrMissingPermissions = solrIndexService.findPermissionsInDatabaseButStaleInOrMissingFromSolr(); + logger.info("completed checking for permissions in database but stale or missing from Solr"); + JsonArrayBuilder stalePermissionList = Json.createArrayBuilder(); + for (Long dvObjectId : staleOrMissingPermissions) { + stalePermissionList.add(dvObjectId); + } + return Json.createObjectBuilder() + .add("dvobjects", stalePermissionList.build()); + } + + private JsonObjectBuilder getPermissionsInSolrButNotDatabase() throws SearchException { + + List staleOrMissingPermissions = indexService.findPermissionsInSolrOnly(); + JsonArrayBuilder stalePermissionList = Json.createArrayBuilder(); + for (String id : staleOrMissingPermissions) { + stalePermissionList.add(id); + } + return Json.createObjectBuilder() + .add("permissions", stalePermissionList.build()); + } + } diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index b46c368e1d6..5b2d63c43eb 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -23,11 +23,9 @@ import edu.harvard.iq.dataverse.dataaccess.DataAccessRequest; import edu.harvard.iq.dataverse.dataaccess.StorageIO; import edu.harvard.iq.dataverse.datavariable.DataVariable; -import edu.harvard.iq.dataverse.datavariable.VariableMetadata; import edu.harvard.iq.dataverse.datavariable.VariableMetadataUtil; import edu.harvard.iq.dataverse.datavariable.VariableServiceBean; import edu.harvard.iq.dataverse.harvest.client.HarvestingClient; -import edu.harvard.iq.dataverse.harvest.server.OaiSetException; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; import edu.harvard.iq.dataverse.util.FileUtil; import edu.harvard.iq.dataverse.util.StringUtil; @@ -35,15 +33,12 @@ import java.io.IOException; import java.io.InputStream; import java.sql.Timestamp; -import java.text.DateFormat; import java.text.SimpleDateFormat; -import java.time.Instant; import java.util.ArrayList; import java.util.Calendar; import java.util.Collection; import java.util.Date; import java.util.HashSet; -import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; @@ -60,13 +55,13 @@ import javax.ejb.Stateless; import javax.ejb.TransactionAttribute; import static javax.ejb.TransactionAttributeType.REQUIRES_NEW; -import javax.inject.Inject; import javax.inject.Named; import javax.persistence.EntityManager; import javax.persistence.PersistenceContext; import org.apache.commons.lang.StringUtils; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.SolrQuery.SortClause; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.HttpSolrClient; import org.apache.solr.client.solrj.response.QueryResponse; @@ -74,6 +69,7 @@ import org.apache.solr.common.SolrDocument; import 
org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.params.CursorMarkParams; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.io.IOUtils; import org.apache.tika.metadata.Metadata; @@ -119,6 +115,9 @@ public class IndexServiceBean { @EJB VariableServiceBean variableService; + + @EJB + IndexBatchServiceBean indexBatchService; public static final String solrDocIdentifierDataverse = "dataverse_"; public static final String solrDocIdentifierFile = "datafile_"; @@ -161,7 +160,7 @@ public void close() { solrServer = null; } } - + @TransactionAttribute(REQUIRES_NEW) public Future indexDataverseInNewTransaction(Dataverse dataverse) throws SolrServerException, IOException{ return indexDataverse(dataverse, false); @@ -1661,46 +1660,28 @@ private String getDesiredCardState(Map des /** * @return Dataverses that should be reindexed either because they have * never been indexed or their index time is before their modification time. + * (Exclude root because it is never indexed) */ - public List findStaleOrMissingDataverses() { - List staleDataverses = new ArrayList<>(); - for (Dataverse dataverse : dataverseService.findAll()) { - if (dataverse.equals(dataverseService.findRootDataverse())) { - continue; - } - if (stale(dataverse)) { - staleDataverses.add(dataverse); - } - } - return staleDataverses; + public List findStaleOrMissingDataverses() { + List staleDataverseIds = dataverseService.findIdStale(); + Long rootId = dataverseService.findRootDataverse().getId(); + List ids = new ArrayList<>(); + staleDataverseIds.stream().filter(id -> (!id.equals(rootId))).forEachOrdered(id -> { + ids.add(id); + }); + return ids; } /** * @return Datasets that should be reindexed either because they have never * been indexed or their index time is before their modification time. 
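
findStaleOrMissingDataverses now delegates the staleness test to the database via dataverseService.findIdStale() instead of loading every dataverse and comparing timestamps in Java. That query is not included in this diff; assuming it mirrors the removed stale() check (index time missing, or older than the modification time), it would look roughly like this:

```java
import java.util.List;
import javax.persistence.EntityManager;

public class StaleIdQuerySketch {
    /** Hypothetical shape of the query behind findIdStale(); the actual
     *  named query in the service bean may differ. */
    public static List<Long> findIdStale(EntityManager em) {
        return em.createQuery(
                "SELECT o.id FROM Dataverse o "
              + "WHERE o.indexTime IS NULL OR o.indexTime < o.modificationTime",
                Long.class)
            .getResultList();
    }
}
```
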
*/ - public List findStaleOrMissingDatasets() { - List staleDatasets = new ArrayList<>(); - for (Dataset dataset : datasetService.findAll()) { - if (stale(dataset)) { - staleDatasets.add(dataset); - } - } - return staleDatasets; - } - - private boolean stale(DvObject dvObject) { - Timestamp indexTime = dvObject.getIndexTime(); - Timestamp modificationTime = dvObject.getModificationTime(); - if (indexTime == null) { - return true; - } else if (indexTime.before(modificationTime)) { - return true; - } - return false; + public List findStaleOrMissingDatasets() { + return datasetService.findIdStale(); } - public List findDataversesInSolrOnly() throws SearchException { + + public List findDataversesInSolrOnly() throws SearchException { try { /** * @todo define this centrally and statically @@ -1711,7 +1692,7 @@ public List findDataversesInSolrOnly() throws SearchException { } } - public List findDatasetsInSolrOnly() throws SearchException { + public List findDatasetsInSolrOnly() throws SearchException { try { /** * @todo define this centrally and statically @@ -1722,7 +1703,7 @@ public List findDatasetsInSolrOnly() throws SearchException { } } - public List findFilesInSolrOnly() throws SearchException { + public List findFilesInSolrOnly() throws SearchException { try { /** * @todo define this centrally and statically @@ -1732,34 +1713,82 @@ public List findFilesInSolrOnly() throws SearchException { throw ex; } } - - private List findDvObjectInSolrOnly(String type) throws SearchException { - SolrQuery solrQuery = new SolrQuery(); - solrQuery.setQuery("*"); - solrQuery.setRows(Integer.MAX_VALUE); - solrQuery.addFilterQuery(SearchFields.TYPE + ":" + type); - List dvObjectInSolrOnly = new ArrayList<>(); - QueryResponse queryResponse = null; + /** + * Finds permissions documents in Solr that don't have corresponding dvObjects + * in the database, and returns a list of their Solr "id" field. 
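
The removed lookup above fetched everything in a single request with setRows(Integer.MAX_VALUE); the replacement, shown just below, pages through Solr with a cursor instead, which keeps memory bounded for large indexes. Distilled into a standalone helper (the field names and row size are illustrative), the deep-paging idiom is:

```java
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.params.CursorMarkParams;

public class CursorMarkSketch {
    /** Collect the "id" field of every document matching a filter query,
     *  100 rows at a time, using Solr cursor-based deep paging. */
    public static List<String> collectIds(SolrClient solr, String filterQuery)
            throws SolrServerException, IOException {
        SolrQuery q = new SolrQuery("*").setRows(100)
                .addFilterQuery(filterQuery)
                .setSort(SortClause.asc("id")); // cursors require a sort on the unique key
        List<String> ids = new ArrayList<>();
        String cursorMark = CursorMarkParams.CURSOR_MARK_START;
        boolean done = false;
        while (!done) {
            q.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark);
            QueryResponse rsp = solr.query(q);
            for (SolrDocument doc : rsp.getResults()) {
                ids.add((String) doc.getFieldValue("id"));
            }
            String next = rsp.getNextCursorMark();
            done = cursorMark.equals(next); // unchanged cursor means no more pages
            cursorMark = next;
        }
        return ids;
    }
}
```
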
+ * @return list of "id" field vales for the orphaned Solr permission documents + * @throws SearchException + */ + public List findPermissionsInSolrOnly() throws SearchException { + List permissionInSolrOnly = new ArrayList<>(); try { - queryResponse = solrClientService.getSolrClient().query(solrQuery); + int rows = 100; + SolrQuery q = (new SolrQuery(SearchFields.DEFINITION_POINT_DVOBJECT_ID+":*")).setRows(rows).setSort(SortClause.asc(SearchFields.ID)); + String cursorMark = CursorMarkParams.CURSOR_MARK_START; + boolean done = false; + while (!done) { + q.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark); + QueryResponse rsp = solrServer.query(q); + String nextCursorMark = rsp.getNextCursorMark(); + SolrDocumentList list = rsp.getResults(); + for (SolrDocument doc: list) { + long id = Long.parseLong((String) doc.getFieldValue(SearchFields.DEFINITION_POINT_DVOBJECT_ID)); + if(!dvObjectService.checkExists(id)) { + permissionInSolrOnly.add((String)doc.getFieldValue(SearchFields.ID)); + } + } + if (cursorMark.equals(nextCursorMark)) { + done = true; + } + cursorMark = nextCursorMark; + } } catch (SolrServerException | IOException ex) { - throw new SearchException("Error searching Solr for " + type, ex); + throw new SearchException("Error searching Solr for permissions" , ex); + } - SolrDocumentList results = queryResponse.getResults(); - for (SolrDocument solrDocument : results) { - Object idObject = solrDocument.getFieldValue(SearchFields.ENTITY_ID); - if (idObject != null) { - try { - long id = (Long) idObject; - DvObject dvobject = dvObjectService.findDvObject(id); - if (dvobject == null) { - dvObjectInSolrOnly.add(id); + return permissionInSolrOnly; + } + + private List findDvObjectInSolrOnly(String type) throws SearchException { + SolrQuery solrQuery = new SolrQuery(); + int rows = 100; + + solrQuery.setQuery("*").setRows(rows).setSort(SortClause.asc(SearchFields.ID)); + solrQuery.addFilterQuery(SearchFields.TYPE + ":" + type); + List dvObjectInSolrOnly = new ArrayList<>(); + + String cursorMark = CursorMarkParams.CURSOR_MARK_START; + boolean done = false; + while (!done) { + solrQuery.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorMark); + QueryResponse rsp = null; + try { + rsp = solrServer.query(solrQuery); + } catch (SolrServerException | IOException ex) { + throw new SearchException("Error searching Solr type: " + type, ex); + + } + String nextCursorMark = rsp.getNextCursorMark(); + SolrDocumentList list = rsp.getResults(); + for (SolrDocument doc: list) { + Object idObject = doc.getFieldValue(SearchFields.ENTITY_ID); + if (idObject != null) { + try { + long id = (Long) idObject; + if (!dvObjectService.checkExists(id)) { + dvObjectInSolrOnly.add((String)doc.getFieldValue(SearchFields.ID)); + } + } catch (ClassCastException ex) { + throw new SearchException("Found " + SearchFields.ENTITY_ID + " but error casting " + idObject + " to long", ex); } - } catch (ClassCastException ex) { - throw new SearchException("Found " + SearchFields.ENTITY_ID + " but error casting " + idObject + " to long", ex); } } + if (cursorMark.equals(nextCursorMark)) { + done = true; + } + cursorMark = nextCursorMark; } + return dvObjectInSolrOnly; } diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SearchIncludeFragment.java b/src/main/java/edu/harvard/iq/dataverse/search/SearchIncludeFragment.java index f5430ae32bb..ac70072b7bb 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SearchIncludeFragment.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SearchIncludeFragment.java @@ -130,7 
+130,6 @@ public class SearchIncludeFragment implements java.io.Serializable { Map staticSolrFieldFriendlyNamesBySolrField = new HashMap<>(); private boolean solrIsDown = false; private Map numberOfFacets = new HashMap<>(); - private boolean debug = false; // private boolean showUnpublished; List filterQueriesDebug = new ArrayList<>(); // private Map friendlyName = new HashMap<>(); @@ -1017,15 +1016,6 @@ public void setRootDv(boolean rootDv) { this.rootDv = rootDv; } - public boolean isDebug() { - return (debug && session.getUser().isSuperuser()) - || settingsWrapper.isTrueForKey(":Debug", false); - } - - public void setDebug(boolean debug) { - this.debug = debug; - } - public List getFilterQueriesDebug() { return filterQueriesDebug; } diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 14e9869aab3..ef4422e8d89 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -22,16 +22,12 @@ import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; -import javax.annotation.PostConstruct; -import javax.annotation.PreDestroy; import javax.ejb.EJB; import javax.ejb.Stateless; import javax.inject.Named; import javax.json.Json; import javax.json.JsonObjectBuilder; -import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrServerException; -import org.apache.solr.client.solrj.impl.HttpSolrClient; import org.apache.solr.client.solrj.response.UpdateResponse; import org.apache.solr.common.SolrInputDocument; @@ -532,47 +528,29 @@ public JsonObjectBuilder deleteAllFromSolrAndResetIndexTimes() throws SolrServer } /** - * @todo Do we want to report the root dataverse (id 1, often) in - * permissionsInDatabaseButMissingFromSolr? + * * * @return A list of dvobject ids that should have their permissions - * re-indexed Solr was down when a permission was added. The permission - * should be added to Solr. + * re-indexed because Solr was down when a permission was added. The permission + * should be added to Solr. The id of the permission contains the type of + * DvObject and the primary key of the dvObject. + * DvObjects of type DataFile are currently skipped because their index + * time isn't stored in the database, since they are indexed along + * with their parent dataset (this may change). */ public List findPermissionsInDatabaseButStaleInOrMissingFromSolr() { List indexingRequired = new ArrayList<>(); long rootDvId = dataverseService.findRootDataverse().getId(); - for (DvObject dvObject : dvObjectService.findAll()) { -// logger.info("examining dvObjectId " + dvObject.getId() + "..."); - Timestamp permissionModificationTime = dvObject.getPermissionModificationTime(); - Timestamp permissionIndexTime = dvObject.getPermissionIndexTime(); - if (permissionIndexTime == null) { - if (dvObject.getId() != rootDvId) { - // we don't index the rootDv - indexingRequired.add(dvObject.getId()); - } - } else if (permissionModificationTime == null) { - /** - * @todo What should we do here? Permissions should always be - * there. They are assigned at create time. 
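
The orphaned ids collected by findDataversesInSolrOnly, findDatasetsInSolrOnly, findFilesInSolrOnly, and findPermissionsInSolrOnly feed the clearOrphans path added earlier, which removes them through solrIndexService.deleteMultipleSolrIds(). That method is not part of this diff; in plain SolrJ, and assuming deletion is keyed on the documents' "id" field, the step amounts to roughly:

```java
import java.io.IOException;
import java.util.List;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;

public class OrphanCleanupSketch {
    /** Remove orphaned documents from the index by their Solr "id" values
     *  and commit the deletion. A sketch, not the actual implementation. */
    public static void deleteOrphans(SolrClient solr, List<String> orphanIds)
            throws SolrServerException, IOException {
        if (orphanIds.isEmpty()) {
            return; // nothing to clean up
        }
        solr.deleteById(orphanIds);
        solr.commit();
    }
}
```
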
- */ - logger.info("no permission modification time for dvobject id " + dvObject.getId()); - } else if (permissionIndexTime.before(permissionModificationTime)) { - indexingRequired.add(dvObject.getId()); + List missingDataversePermissionIds = dataverseService.findIdStalePermission(); + List missingDatasetPermissionIds = datasetService.findIdStalePermission(); + for (Long id : missingDataversePermissionIds) { + if (!id.equals(rootDvId)) { + indexingRequired.add(id); } } + indexingRequired.addAll(missingDatasetPermissionIds); return indexingRequired; } - /** - * @return A list of dvobject ids that should have their permissions - * re-indexed because Solr was down when a permission was revoked. The - * permission should be removed from Solr. - */ - public List findPermissionsInSolrNoLongerInDatabase() { - /** - * @todo Implement this! - */ - return new ArrayList<>(); - } + } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index f9ee57a07d5..33d1ec51da2 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -697,143 +697,153 @@ private static String checksumDigestToString(byte[] digestBytes) { } public static String generateOriginalExtension(String fileType) { - if (fileType.equalsIgnoreCase("application/x-spss-sav")) { return ".sav"; } else if (fileType.equalsIgnoreCase("application/x-spss-por")) { - return ".por"; - } else if (fileType.equalsIgnoreCase("application/x-stata")) { + return ".por"; + // in addition to "application/x-stata" we want to support + // "application/x-stata-13" ... etc.: + } else if (fileType.toLowerCase().startsWith("application/x-stata")) { return ".dta"; - } else if (fileType.equalsIgnoreCase( "application/x-rlang-transport")) { + } else if (fileType.equalsIgnoreCase("application/x-dvn-csvspss-zip")) { + return ".zip"; + } else if (fileType.equalsIgnoreCase("application/x-dvn-tabddi-zip")) { + return ".zip"; + } else if (fileType.equalsIgnoreCase("application/x-rlang-transport")) { return ".RData"; - } else if (fileType.equalsIgnoreCase("text/csv")) { + } else if (fileType.equalsIgnoreCase("text/csv") || fileType.equalsIgnoreCase("text/comma-separated-values")) { return ".csv"; - } else if (fileType.equalsIgnoreCase( "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) { + } else if (fileType.equalsIgnoreCase("text/tsv") || fileType.equalsIgnoreCase("text/tab-separated-values")) { + return ".tsv"; + } else if (fileType.equalsIgnoreCase("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) { return ".xlsx"; } - return ""; } public static List createDataFiles(DatasetVersion version, InputStream inputStream, String fileName, String suppliedContentType, String newStorageIdentifier, String newCheckSum, SystemConfig systemConfig) throws IOException { - List datafiles = new ArrayList<>(); - - String warningMessage = null; - + List datafiles = new ArrayList<>(); + + String warningMessage = null; + // save the file, in the temporary location for now: - Path tempFile = null; - - Long fileSizeLimit = systemConfig.getMaxFileUploadSizeForStore(version.getDataset().getOwner().getEffectiveStorageDriverId()); - String finalType = null; - if (newStorageIdentifier == null) { - if (getFilesTempDirectory() != null) { - tempFile = Files.createTempFile(Paths.get(getFilesTempDirectory()), "tmp", "upload"); - // "temporary" location is the key here; this is why we are not using - // the 
DataStore framework for this - the assumption is that - // temp files will always be stored on the local filesystem. - // -- L.A. Jul. 2014 - logger.fine("Will attempt to save the file as: " + tempFile.toString()); - Files.copy(inputStream, tempFile, StandardCopyOption.REPLACE_EXISTING); - - // A file size check, before we do anything else: - // (note that "no size limit set" = "unlimited") - // (also note, that if this is a zip file, we'll be checking - // the size limit for each of the individual unpacked files) - Long fileSize = tempFile.toFile().length(); - if (fileSizeLimit != null && fileSize > fileSizeLimit) { - try {tempFile.toFile().delete();} catch (Exception ex) {} - throw new IOException (MessageFormat.format(BundleUtil.getStringFromBundle("file.addreplace.error.file_exceeds_limit"), bytesToHumanReadable(fileSize), bytesToHumanReadable(fileSizeLimit))); - } - - } else { - throw new IOException("Temp directory is not configured."); - } - logger.fine("mime type supplied: " + suppliedContentType); - // Let's try our own utilities (Jhove, etc.) to determine the file type - // of the uploaded file. (We may already have a mime type supplied for this - // file - maybe the type that the browser recognized on upload; or, if - // it's a harvest, maybe the remote server has already given us the type - // for this file... with our own type utility we may or may not do better - // than the type supplied: - // -- L.A. - String recognizedType = null; - - try { - recognizedType = determineFileType(tempFile.toFile(), fileName); - logger.fine("File utility recognized the file as " + recognizedType); - if (recognizedType != null && !recognizedType.equals("")) { - if(useRecognizedType(suppliedContentType, recognizedType)) { - finalType=recognizedType; - } - } - - } catch (Exception ex) { - logger.warning("Failed to run the file utility mime type check on file " + fileName); - } - - if (finalType == null) { - finalType = (suppliedContentType == null || suppliedContentType.equals("")) - ? MIME_TYPE_UNDETERMINED_DEFAULT - : suppliedContentType; - } - - // A few special cases: - - // if this is a gzipped FITS file, we'll uncompress it, and ingest it as - // a regular FITS file: - - if (finalType.equals("application/fits-gzipped")) { - - InputStream uncompressedIn = null; - String finalFileName = fileName; - // if the file name had the ".gz" extension, remove it, - // since we are going to uncompress it: - if (fileName != null && fileName.matches(".*\\.gz$")) { - finalFileName = fileName.replaceAll("\\.gz$", ""); - } - - DataFile datafile = null; - try { - uncompressedIn = new GZIPInputStream(new FileInputStream(tempFile.toFile())); - File unZippedTempFile = saveInputStreamInTempFile(uncompressedIn, fileSizeLimit); - datafile = createSingleDataFile(version, unZippedTempFile, finalFileName, MIME_TYPE_UNDETERMINED_DEFAULT, systemConfig.getFileFixityChecksumAlgorithm()); - } catch (IOException | FileExceedsMaxSizeException ioex) { - datafile = null; - } finally { - if (uncompressedIn != null) { - try {uncompressedIn.close();} catch (IOException e) {} - } - } - - // If we were able to produce an uncompressed file, we'll use it - // to create and return a final DataFile; if not, we're not going - // to do anything - and then a new DataFile will be created further - // down, from the original, uncompressed file. 
- if (datafile != null) { - // remove the compressed temp file: - try { - tempFile.toFile().delete(); - } catch (SecurityException ex) { - // (this is very non-fatal) - logger.warning("Failed to delete temporary file " + tempFile.toString()); - } - - datafiles.add(datafile); - return datafiles; - } - - // If it's a ZIP file, we are going to unpack it and create multiple - // DataFile objects from its contents: - } else if (finalType.equals("application/zip")) { - - ZipInputStream unZippedIn = null; - ZipEntry zipEntry = null; - - int fileNumberLimit = systemConfig.getZipUploadFilesLimit(); - - try { - Charset charset = null; - /* + Path tempFile = null; + + Long fileSizeLimit = systemConfig.getMaxFileUploadSizeForStore(version.getDataset().getEffectiveStorageDriverId()); + String finalType = null; + if (newStorageIdentifier == null) { + if (getFilesTempDirectory() != null) { + tempFile = Files.createTempFile(Paths.get(getFilesTempDirectory()), "tmp", "upload"); + // "temporary" location is the key here; this is why we are not using + // the DataStore framework for this - the assumption is that + // temp files will always be stored on the local filesystem. + // -- L.A. Jul. 2014 + logger.fine("Will attempt to save the file as: " + tempFile.toString()); + Files.copy(inputStream, tempFile, StandardCopyOption.REPLACE_EXISTING); + + // A file size check, before we do anything else: + // (note that "no size limit set" = "unlimited") + // (also note, that if this is a zip file, we'll be checking + // the size limit for each of the individual unpacked files) + Long fileSize = tempFile.toFile().length(); + if (fileSizeLimit != null && fileSize > fileSizeLimit) { + try { + tempFile.toFile().delete(); + } catch (Exception ex) { + } + throw new IOException(MessageFormat.format(BundleUtil.getStringFromBundle("file.addreplace.error.file_exceeds_limit"), bytesToHumanReadable(fileSize), bytesToHumanReadable(fileSizeLimit))); + } + + } else { + throw new IOException("Temp directory is not configured."); + } + logger.fine("mime type supplied: " + suppliedContentType); + // Let's try our own utilities (Jhove, etc.) to determine the file type + // of the uploaded file. (We may already have a mime type supplied for this + // file - maybe the type that the browser recognized on upload; or, if + // it's a harvest, maybe the remote server has already given us the type + // for this file... with our own type utility we may or may not do better + // than the type supplied: + // -- L.A. + String recognizedType = null; + + try { + recognizedType = determineFileType(tempFile.toFile(), fileName); + logger.fine("File utility recognized the file as " + recognizedType); + if (recognizedType != null && !recognizedType.equals("")) { + if (useRecognizedType(suppliedContentType, recognizedType)) { + finalType = recognizedType; + } + } + + } catch (Exception ex) { + logger.warning("Failed to run the file utility mime type check on file " + fileName); + } + + if (finalType == null) { + finalType = (suppliedContentType == null || suppliedContentType.equals("")) + ? 
MIME_TYPE_UNDETERMINED_DEFAULT + : suppliedContentType; + } + + // A few special cases: + // if this is a gzipped FITS file, we'll uncompress it, and ingest it as + // a regular FITS file: + if (finalType.equals("application/fits-gzipped")) { + + InputStream uncompressedIn = null; + String finalFileName = fileName; + // if the file name had the ".gz" extension, remove it, + // since we are going to uncompress it: + if (fileName != null && fileName.matches(".*\\.gz$")) { + finalFileName = fileName.replaceAll("\\.gz$", ""); + } + + DataFile datafile = null; + try { + uncompressedIn = new GZIPInputStream(new FileInputStream(tempFile.toFile())); + File unZippedTempFile = saveInputStreamInTempFile(uncompressedIn, fileSizeLimit); + datafile = createSingleDataFile(version, unZippedTempFile, finalFileName, MIME_TYPE_UNDETERMINED_DEFAULT, systemConfig.getFileFixityChecksumAlgorithm()); + } catch (IOException | FileExceedsMaxSizeException ioex) { + datafile = null; + } finally { + if (uncompressedIn != null) { + try { + uncompressedIn.close(); + } catch (IOException e) { + } + } + } + + // If we were able to produce an uncompressed file, we'll use it + // to create and return a final DataFile; if not, we're not going + // to do anything - and then a new DataFile will be created further + // down, from the original, uncompressed file. + if (datafile != null) { + // remove the compressed temp file: + try { + tempFile.toFile().delete(); + } catch (SecurityException ex) { + // (this is very non-fatal) + logger.warning("Failed to delete temporary file " + tempFile.toString()); + } + + datafiles.add(datafile); + return datafiles; + } + + // If it's a ZIP file, we are going to unpack it and create multiple + // DataFile objects from its contents: + } else if (finalType.equals("application/zip")) { + + ZipInputStream unZippedIn = null; + ZipEntry zipEntry = null; + + int fileNumberLimit = systemConfig.getZipUploadFilesLimit(); + + try { + Charset charset = null; + /* TODO: (?) We may want to investigate somehow letting the user specify the charset for the filenames in the zip file... @@ -853,126 +863,129 @@ public static List createDataFiles(DatasetVersion version, InputStream } } - */ - - if (charset != null) { - unZippedIn = new ZipInputStream(new FileInputStream(tempFile.toFile()), charset); - } else { - unZippedIn = new ZipInputStream(new FileInputStream(tempFile.toFile())); - } - - while (true) { - try { - zipEntry = unZippedIn.getNextEntry(); - } catch (IllegalArgumentException iaex) { - // Note: - // ZipInputStream documentation doesn't even mention that - // getNextEntry() throws an IllegalArgumentException! - // but that's what happens if the file name of the next - // entry is not valid in the current CharSet. - // -- L.A. - warningMessage = "Failed to unpack Zip file. (Unknown Character Set used in a file name?) 
Saving the file as is."; - logger.warning(warningMessage); - throw new IOException(); - } - - if (zipEntry == null) { - break; - } - // Note that some zip entries may be directories - we - // simply skip them: - - if (!zipEntry.isDirectory()) { - if (datafiles.size() > fileNumberLimit) { - logger.warning("Zip upload - too many files."); - warningMessage = "The number of files in the zip archive is over the limit (" + fileNumberLimit + - "); please upload a zip archive with fewer files, if you want them to be ingested " + - "as individual DataFiles."; - throw new IOException(); - } - - String fileEntryName = zipEntry.getName(); - logger.fine("ZipEntry, file: " + fileEntryName); - - if (fileEntryName != null && !fileEntryName.equals("")) { - - String shortName = fileEntryName.replaceFirst("^.*[\\/]", ""); - - // Check if it's a "fake" file - a zip archive entry - // created for a MacOS X filesystem element: (these - // start with "._") - if (!shortName.startsWith("._") && !shortName.startsWith(".DS_Store") && !"".equals(shortName)) { - // OK, this seems like an OK file entry - we'll try - // to read it and create a DataFile with it: - - File unZippedTempFile = saveInputStreamInTempFile(unZippedIn, fileSizeLimit); - DataFile datafile = createSingleDataFile(version, unZippedTempFile, null, shortName, - MIME_TYPE_UNDETERMINED_DEFAULT, - systemConfig.getFileFixityChecksumAlgorithm(), null, false); - - if (!fileEntryName.equals(shortName)) { - // If the filename looks like a hierarchical folder name (i.e., contains slashes and backslashes), - // we'll extract the directory name; then subject it to some "aggressive sanitizing" - strip all - // the leading, trailing and duplicate slashes; then replace all the characters that - // don't pass our validation rules. - String directoryName = fileEntryName.replaceFirst("[\\\\/][\\\\/]*[^\\\\/]*$", ""); - directoryName = StringUtil.sanitizeFileDirectory(directoryName, true); - // if (!"".equals(directoryName)) { - if (!StringUtil.isEmpty(directoryName)) { - logger.fine("setting the directory label to " + directoryName); - datafile.getFileMetadata().setDirectoryLabel(directoryName); - } - } - - if (datafile != null) { - // We have created this datafile with the mime type "unknown"; - // Now that we have it saved in a temporary location, - // let's try and determine its real type: - - String tempFileName = getFilesTempDirectory() + "/" + datafile.getStorageIdentifier(); - - try { - recognizedType = determineFileType(new File(tempFileName), shortName); - logger.fine("File utility recognized unzipped file as " + recognizedType); - if (recognizedType != null && !recognizedType.equals("")) { - datafile.setContentType(recognizedType); - } - } catch (Exception ex) { - logger.warning("Failed to run the file utility mime type check on file " + fileName); - } - - datafiles.add(datafile); - } - } - } - } - unZippedIn.closeEntry(); - - } - - } catch (IOException ioex) { - // just clear the datafiles list and let - // ingest default to creating a single DataFile out - // of the unzipped file. - logger.warning("Unzipping failed; rolling back to saving the file as is."); - if (warningMessage == null) { - warningMessage = "Failed to unzip the file. Saving the file as is."; - } - - datafiles.clear(); - } catch (FileExceedsMaxSizeException femsx) { - logger.warning("One of the unzipped files exceeds the size limit; resorting to saving the file as is. 
" + femsx.getMessage()); - warningMessage = femsx.getMessage() + "; saving the zip file as is, unzipped."; - datafiles.clear(); - } finally { - if (unZippedIn != null) { - try {unZippedIn.close();} catch (Exception zEx) {} - } - } - if (datafiles.size() > 0) { - // link the data files to the dataset/version: - // (except we no longer want to do this! -- 4.6) - /*Iterator itf = datafiles.iterator(); + */ + + if (charset != null) { + unZippedIn = new ZipInputStream(new FileInputStream(tempFile.toFile()), charset); + } else { + unZippedIn = new ZipInputStream(new FileInputStream(tempFile.toFile())); + } + + while (true) { + try { + zipEntry = unZippedIn.getNextEntry(); + } catch (IllegalArgumentException iaex) { + // Note: + // ZipInputStream documentation doesn't even mention that + // getNextEntry() throws an IllegalArgumentException! + // but that's what happens if the file name of the next + // entry is not valid in the current CharSet. + // -- L.A. + warningMessage = "Failed to unpack Zip file. (Unknown Character Set used in a file name?) Saving the file as is."; + logger.warning(warningMessage); + throw new IOException(); + } + + if (zipEntry == null) { + break; + } + // Note that some zip entries may be directories - we + // simply skip them: + + if (!zipEntry.isDirectory()) { + if (datafiles.size() > fileNumberLimit) { + logger.warning("Zip upload - too many files."); + warningMessage = "The number of files in the zip archive is over the limit (" + fileNumberLimit + + "); please upload a zip archive with fewer files, if you want them to be ingested " + + "as individual DataFiles."; + throw new IOException(); + } + + String fileEntryName = zipEntry.getName(); + logger.fine("ZipEntry, file: " + fileEntryName); + + if (fileEntryName != null && !fileEntryName.equals("")) { + + String shortName = fileEntryName.replaceFirst("^.*[\\/]", ""); + + // Check if it's a "fake" file - a zip archive entry + // created for a MacOS X filesystem element: (these + // start with "._") + if (!shortName.startsWith("._") && !shortName.startsWith(".DS_Store") && !"".equals(shortName)) { + // OK, this seems like an OK file entry - we'll try + // to read it and create a DataFile with it: + + File unZippedTempFile = saveInputStreamInTempFile(unZippedIn, fileSizeLimit); + DataFile datafile = createSingleDataFile(version, unZippedTempFile, null, shortName, + MIME_TYPE_UNDETERMINED_DEFAULT, + systemConfig.getFileFixityChecksumAlgorithm(), null, false); + + if (!fileEntryName.equals(shortName)) { + // If the filename looks like a hierarchical folder name (i.e., contains slashes and backslashes), + // we'll extract the directory name; then subject it to some "aggressive sanitizing" - strip all + // the leading, trailing and duplicate slashes; then replace all the characters that + // don't pass our validation rules. 
+ String directoryName = fileEntryName.replaceFirst("[\\\\/][\\\\/]*[^\\\\/]*$", ""); + directoryName = StringUtil.sanitizeFileDirectory(directoryName, true); + // if (!"".equals(directoryName)) { + if (!StringUtil.isEmpty(directoryName)) { + logger.fine("setting the directory label to " + directoryName); + datafile.getFileMetadata().setDirectoryLabel(directoryName); + } + } + + if (datafile != null) { + // We have created this datafile with the mime type "unknown"; + // Now that we have it saved in a temporary location, + // let's try and determine its real type: + + String tempFileName = getFilesTempDirectory() + "/" + datafile.getStorageIdentifier(); + + try { + recognizedType = determineFileType(new File(tempFileName), shortName); + logger.fine("File utility recognized unzipped file as " + recognizedType); + if (recognizedType != null && !recognizedType.equals("")) { + datafile.setContentType(recognizedType); + } + } catch (Exception ex) { + logger.warning("Failed to run the file utility mime type check on file " + fileName); + } + + datafiles.add(datafile); + } + } + } + } + unZippedIn.closeEntry(); + + } + + } catch (IOException ioex) { + // just clear the datafiles list and let + // ingest default to creating a single DataFile out + // of the unzipped file. + logger.warning("Unzipping failed; rolling back to saving the file as is."); + if (warningMessage == null) { + warningMessage = "Failed to unzip the file. Saving the file as is."; + } + + datafiles.clear(); + } catch (FileExceedsMaxSizeException femsx) { + logger.warning("One of the unzipped files exceeds the size limit; resorting to saving the file as is. " + femsx.getMessage()); + warningMessage = femsx.getMessage() + "; saving the zip file as is, unzipped."; + datafiles.clear(); + } finally { + if (unZippedIn != null) { + try { + unZippedIn.close(); + } catch (Exception zEx) { + } + } + } + if (datafiles.size() > 0) { + // link the data files to the dataset/version: + // (except we no longer want to do this! -- 4.6) + /*Iterator itf = datafiles.iterator(); while (itf.hasNext()) { DataFile datafile = itf.next(); datafile.setOwner(version.getDataset()); @@ -984,125 +997,138 @@ public static List createDataFiles(DatasetVersion version, InputStream version.getDataset().getFiles().add(datafile); } */ - // remove the uploaded zip file: - try { - Files.delete(tempFile); - } catch (IOException ioex) { - // do nothing - it's just a temp file. 
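For a concrete sense of the directory-label derivation just above, here is the same two-step transformation applied to a hypothetical zip entry name (the exact character rules live in StringUtil.sanitizeFileDirectory):

    // Hypothetical entry name, for illustration only:
    String fileEntryName = "data/2020//readme.txt";

    // Strip the trailing file-name component, keeping the folder part:
    String directoryName = fileEntryName.replaceFirst("[\\\\/][\\\\/]*[^\\\\/]*$", "");
    // directoryName is now "data/2020"

    // sanitizeFileDirectory() then trims leading/trailing/duplicate slashes and
    // replaces characters that fail validation before the value is stored as
    // the FileMetadata directoryLabel.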
- logger.warning("Could not remove temp file " + tempFile.getFileName().toString()); - } - // and return: - return datafiles; - } - - } else if (finalType.equalsIgnoreCase(ShapefileHandler.SHAPEFILE_FILE_TYPE)) { - // Shape files may have to be split into multiple files, - // one zip archive per each complete set of shape files: - - // File rezipFolder = new File(this.getFilesTempDirectory()); - File rezipFolder = getShapefileUnzipTempDirectory(); - - IngestServiceShapefileHelper shpIngestHelper; - shpIngestHelper = new IngestServiceShapefileHelper(tempFile.toFile(), rezipFolder); - - boolean didProcessWork = shpIngestHelper.processFile(); - if (!(didProcessWork)) { - logger.severe("Processing of zipped shapefile failed."); - return null; - } - - try { - for (File finalFile : shpIngestHelper.getFinalRezippedFiles()) { - FileInputStream finalFileInputStream = new FileInputStream(finalFile); - finalType = determineContentType(finalFile); - if (finalType == null) { - logger.warning("Content type is null; but should default to 'MIME_TYPE_UNDETERMINED_DEFAULT'"); - continue; - } - - File unZippedShapeTempFile = saveInputStreamInTempFile(finalFileInputStream, fileSizeLimit); - DataFile new_datafile = createSingleDataFile(version, unZippedShapeTempFile, finalFile.getName(), finalType, systemConfig.getFileFixityChecksumAlgorithm()); - if (new_datafile != null) { - datafiles.add(new_datafile); - } else { - logger.severe("Could not add part of rezipped shapefile. new_datafile was null: " + finalFile.getName()); - } - finalFileInputStream.close(); - - } - } catch (FileExceedsMaxSizeException femsx) { - logger.severe("One of the unzipped shape files exceeded the size limit; giving up. " + femsx.getMessage()); - datafiles.clear(); - } - - // Delete the temp directory used for unzipping - // The try-catch is due to error encountered in using NFS for stocking file, - // cf. https://github.com/IQSS/dataverse/issues/5909 - try { - FileUtils.deleteDirectory(rezipFolder); - } catch (IOException ioex) { - // do nothing - it's a tempo folder. - logger.warning("Could not remove temp folder, error message : " + ioex.getMessage()); - } - - if (datafiles.size() > 0) { - // remove the uploaded zip file: - try { - Files.delete(tempFile); - } catch (IOException ioex) { - // do nothing - it's just a temp file. - logger.warning("Could not remove temp file " + tempFile.getFileName().toString()); - } catch (SecurityException se) { - logger.warning("Unable to delete: " + tempFile.toString() + "due to Security Exception: " - + se.getMessage()); - } - return datafiles; - } else { - logger.severe("No files added from directory of rezipped shapefiles"); - } - return null; - - } - } else { - // Default to suppliedContentType if set or the overall undetermined default if a contenttype isn't supplied - finalType = StringUtils.isBlank(suppliedContentType) ? FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT : suppliedContentType; - String type = determineFileTypeByExtension(fileName); - if (!StringUtils.isBlank(type)) { - //Use rules for deciding when to trust browser supplied type - if (useRecognizedType(finalType, type)) { - finalType = type; - } - logger.fine("Supplied type: " + suppliedContentType + ", finalType: " + finalType); - } - } + // remove the uploaded zip file: + try { + Files.delete(tempFile); + } catch (IOException ioex) { + // do nothing - it's just a temp file. 
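In the rewritten shapefile branch that follows, each rezipped archive keeps its subfolder, relative to the temporary rezip folder, as the FileMetadata directory label. Roughly, with hypothetical paths (the file name matches the one used in the FilesIT test later in this diff):

    // Hypothetical locations, for illustration only:
    File rezipFolder = new File("/tmp/shp_rezip_1234");
    File finalFile = new File("/tmp/shp_rezip_1234/subfolder/boston_public_schools_2012_z1l.zip");

    String directoryName = null;
    String absolutePathName = finalFile.getParent(); // "/tmp/shp_rezip_1234/subfolder"
    if (absolutePathName != null
            && absolutePathName.length() > rezipFolder.toString().length()) {
        // Strip the rezip folder prefix and the separator that follows it:
        directoryName = absolutePathName.substring(rezipFolder.toString().length() + 1);
    }
    // directoryName is now "subfolder" and ends up as the file's directoryLabel.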
+ logger.warning("Could not remove temp file " + tempFile.getFileName().toString()); + } + // and return: + return datafiles; + } + + } else if (finalType.equalsIgnoreCase(ShapefileHandler.SHAPEFILE_FILE_TYPE)) { + // Shape files may have to be split into multiple files, + // one zip archive per each complete set of shape files: + + // File rezipFolder = new File(this.getFilesTempDirectory()); + File rezipFolder = getShapefileUnzipTempDirectory(); + + IngestServiceShapefileHelper shpIngestHelper; + shpIngestHelper = new IngestServiceShapefileHelper(tempFile.toFile(), rezipFolder); + + boolean didProcessWork = shpIngestHelper.processFile(); + if (!(didProcessWork)) { + logger.severe("Processing of zipped shapefile failed."); + return null; + } + + try { + for (File finalFile : shpIngestHelper.getFinalRezippedFiles()) { + FileInputStream finalFileInputStream = new FileInputStream(finalFile); + finalType = determineContentType(finalFile); + if (finalType == null) { + logger.warning("Content type is null; but should default to 'MIME_TYPE_UNDETERMINED_DEFAULT'"); + continue; + } + + File unZippedShapeTempFile = saveInputStreamInTempFile(finalFileInputStream, fileSizeLimit); + DataFile new_datafile = createSingleDataFile(version, unZippedShapeTempFile, finalFile.getName(), finalType, systemConfig.getFileFixityChecksumAlgorithm()); + String directoryName = null; + String absolutePathName = finalFile.getParent(); + if (absolutePathName != null) { + if (absolutePathName.length() > rezipFolder.toString().length()) { + // This file lives in a subfolder - we want to + // preserve it in the FileMetadata: + directoryName = absolutePathName.substring(rezipFolder.toString().length() + 1); + + if (!StringUtil.isEmpty(directoryName)) { + new_datafile.getFileMetadata().setDirectoryLabel(directoryName); + } + } + } + if (new_datafile != null) { + datafiles.add(new_datafile); + } else { + logger.severe("Could not add part of rezipped shapefile. new_datafile was null: " + finalFile.getName()); + } + finalFileInputStream.close(); + + } + } catch (FileExceedsMaxSizeException femsx) { + logger.severe("One of the unzipped shape files exceeded the size limit; giving up. " + femsx.getMessage()); + datafiles.clear(); + } + + // Delete the temp directory used for unzipping + // The try-catch is due to error encountered in using NFS for stocking file, + // cf. https://github.com/IQSS/dataverse/issues/5909 + try { + FileUtils.deleteDirectory(rezipFolder); + } catch (IOException ioex) { + // do nothing - it's a tempo folder. + logger.warning("Could not remove temp folder, error message : " + ioex.getMessage()); + } + + if (datafiles.size() > 0) { + // remove the uploaded zip file: + try { + Files.delete(tempFile); + } catch (IOException ioex) { + // do nothing - it's just a temp file. + logger.warning("Could not remove temp file " + tempFile.getFileName().toString()); + } catch (SecurityException se) { + logger.warning("Unable to delete: " + tempFile.toString() + "due to Security Exception: " + + se.getMessage()); + } + return datafiles; + } else { + logger.severe("No files added from directory of rezipped shapefiles"); + } + return null; + + } + } else { + // Default to suppliedContentType if set or the overall undetermined default if a contenttype isn't supplied + finalType = StringUtils.isBlank(suppliedContentType) ? 
FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT : suppliedContentType; + String type = determineFileTypeByExtension(fileName); + if (!StringUtils.isBlank(type)) { + //Use rules for deciding when to trust browser supplied type + if (useRecognizedType(finalType, type)) { + finalType = type; + } + logger.fine("Supplied type: " + suppliedContentType + ", finalType: " + finalType); + } + } // Finally, if none of the special cases above were applicable (or // if we were unable to unpack an uploaded file, etc.), we'll just // create and return a single DataFile: File newFile = null; - if(tempFile!=null) { - newFile = tempFile.toFile(); + if (tempFile != null) { + newFile = tempFile.toFile(); } ChecksumType checkSumType = DataFile.ChecksumType.MD5; - if(newStorageIdentifier==null) { - checkSumType=systemConfig.getFileFixityChecksumAlgorithm(); + if (newStorageIdentifier == null) { + checkSumType = systemConfig.getFileFixityChecksumAlgorithm(); } - + DataFile datafile = createSingleDataFile(version, newFile, newStorageIdentifier, fileName, finalType, checkSumType, newCheckSum); File f = null; - if(tempFile!=null) { - f=tempFile.toFile(); + if (tempFile != null) { + f = tempFile.toFile(); } - if (datafile != null && ((f != null) || (newStorageIdentifier!=null))) { - + if (datafile != null && ((f != null) || (newStorageIdentifier != null))) { + if (warningMessage != null) { createIngestFailureReport(datafile, warningMessage); datafile.SetIngestProblem(); } datafiles.add(datafile); - + return datafiles; } - + return null; } // end createDataFiles @@ -1333,7 +1359,7 @@ public static String getFilesTempDirectory() { } public static void generateS3PackageStorageIdentifier(DataFile dataFile) { - String driverId = dataFile.getDataverseContext().getEffectiveStorageDriverId(); + String driverId = dataFile.getOwner().getEffectiveStorageDriverId(); String bucketName = System.getProperty("dataverse.files." + driverId + ".bucket-name"); String storageId = driverId + "://" + bucketName + ":" + dataFile.getFileMetadata().getLabel(); @@ -1679,7 +1705,7 @@ public static boolean isPackageFile(DataFile dataFile) { } public static S3AccessIO getS3AccessForDirectUpload(Dataset dataset) { - String driverId = dataset.getDataverseContext().getEffectiveStorageDriverId(); + String driverId = dataset.getEffectiveStorageDriverId(); boolean directEnabled = Boolean.getBoolean("dataverse.files." + driverId + ".upload-redirect"); //Should only be requested when it is allowed, but we'll log a warning otherwise if(!directEnabled) { diff --git a/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java b/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java index e147f5fed0b..3af562882f3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java @@ -273,14 +273,25 @@ private String getFileBasename(String fileName){ } return unzipFileName; } - /* - Unzip the files to the directory, FLATTENING the directory structure - - Any colliding names will result in overwrites + private String getFolderName(String fileName){ + if (fileName==null){ + return null; + } + return new File(fileName).getParent(); + } + /* + We used to unzip the files to the directory, FLATTENING the directory structure + Any colliding names would result in overwrites + HOWEVER, starting with v5.1, we are now preserving the folder structure + inside the uploaded zip file (issue #6873). 
To achieve this, we recreate + all the folders as they appear in the original zip archive, and as we + rezip any found shape file sets. The FileUtil then preserve any such + subfolders in the FileMetadata of the newly created DataFiles. + (-- L.A. 09/2020) */ private boolean unzipFilesToDirectory(FileInputStream zipfile_input_stream, File target_directory){ - //logger.info("unzipFilesToDirectory: " + target_directory.getAbsolutePath() ); + logger.fine("unzipFilesToDirectory: " + target_directory.getAbsolutePath() ); if (zipfile_input_stream== null){ this.addErrorMessage("unzipFilesToDirectory. The zipfile_input_stream is null."); @@ -301,7 +312,7 @@ private boolean unzipFilesToDirectory(FileInputStream zipfile_input_stream, File while((origEntry = zipStream.getNextEntry())!=null){ String zentryFileName = origEntry.getName(); - //logger.info("\nOriginal entry name: " + origEntry); + logger.fine("\nOriginal entry name: " + origEntry); if (this.isFileToSkip(zentryFileName)){ logger.fine("Skip file"); @@ -312,20 +323,35 @@ private boolean unzipFilesToDirectory(FileInputStream zipfile_input_stream, File if (origEntry.isDirectory()) { //logger.info("Subdirectory found!"); logger.fine("Skip directory"); - //String dirpath = target_directory.getAbsolutePath() + "/" + zentryFileName; - //createDirectory(dirpath); + String dirpath = target_directory.getAbsolutePath() + "/" + zentryFileName; + createDirectory(dirpath); continue; // Continue to next Entry } logger.fine("file found!"); // Write the file String unzipFileName = this.getFileBasename(zentryFileName); + String unzipFolderName = this.getFolderName(zentryFileName); + + String unzipFilePath = unzipFileName; + if (unzipFolderName != null) { + unzipFilePath = unzipFolderName + "/" + unzipFileName; + + // There's a chance we haven't created this folder yet + // in the destination directory (this happens if the folder + // is not explicitly listed in the Zip archive directory). + String dirpath = target_directory.getAbsolutePath() + "/" + unzipFolderName; + // (and if it already exists, it'll be skipped) + createDirectory(dirpath); + } + if (unzipFileName==null){ logger.warning("Zip Entry Basename is an empty string: " + zentryFileName); continue; } - String outpath = target_directory.getAbsolutePath() + "/" + unzipFileName; + //String outpath = target_directory.getAbsolutePath() + "/" + unzipFileName; + String outpath = target_directory.getAbsolutePath() + "/" + unzipFilePath; if (unzippedFileNames.contains(outpath)){ logger.info("Potential name collision. Avoiding duplicate files in 'collapsed' zip directories. 
Skipping file: " + zentryFileName); continue; @@ -493,6 +519,8 @@ private boolean redistributeFilesFromZip(String source_dirname, String target_di //this.msg("source_dirname: "+ source_dirname); //msgt("create zipped shapefile"); + // Make sure the parent folder(s) are there: + createDirectory(new File(target_zipfile_name).getParentFile()); ZipMaker zip_maker = new ZipMaker(namesToZip, source_dirname, target_zipfile_name); this.addFinalRezippedFile(target_zipfile_name); @@ -526,6 +554,11 @@ private boolean straightFileCopy(String sourceFileName, String targetFileName){ File source_file = new File(sourceFileName); File target_file = new File(targetFileName); + + if (target_file.getParentFile() != null) { + // Make sure the parent folder(s) are there: + createDirectory(target_file.getParentFile()); + } try { Files.copy(source_file.toPath(), target_file.toPath(), REPLACE_EXISTING); } catch (IOException ex) { @@ -681,22 +714,28 @@ private boolean examineZipfile(FileInputStream zip_file_stream){ //createDirectory(dirpath); continue; } - + String unzipFileName = this.getFileBasename(zentryFileName); if (unzipFileName==null){ logger.warning("Zip Entry Basename is an empty string: " + zentryFileName); continue; } + String unzipFolderName = this.getFolderName(zentryFileName); + + String unzipFilePath = unzipFileName; + if (unzipFolderName != null) { + unzipFilePath = unzipFolderName + "/" + unzipFileName; + } String s = String.format("Entry: %s len %d added %TD", - unzipFileName, entry.getSize(), + unzipFilePath, entry.getSize(), new Date(entry.getTime())); if (!this.filesListInDir.contains(s)){ this.filesListInDir.add(s); - updateFileGroupHash(unzipFileName); - this.filesizeHash.put(unzipFileName, entry.getSize()); + updateFileGroupHash(unzipFilePath); + this.filesizeHash.put(unzipFilePath, entry.getSize()); } } // end while diff --git a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java index d98dfa8ab34..332aaa0326f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java @@ -4,6 +4,7 @@ import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DataverseServiceBean; +import edu.harvard.iq.dataverse.DvObjectContainer; import edu.harvard.iq.dataverse.authorization.AuthenticationServiceBean; import edu.harvard.iq.dataverse.authorization.providers.builtin.BuiltinAuthenticationProvider; import edu.harvard.iq.dataverse.authorization.providers.oauth2.AbstractOAuth2AuthenticationProvider; @@ -1070,8 +1071,8 @@ public boolean isDatafileValidationOnPublishEnabled() { return settingsService.isTrueForKey(SettingsServiceBean.Key.FileValidationOnPublishEnabled, safeDefaultIfKeyNotFound); } - public boolean directUploadEnabled(Dataset dataset) { - return Boolean.getBoolean("dataverse.files." + dataset.getDataverseContext().getEffectiveStorageDriverId() + ".upload-redirect"); + public boolean directUploadEnabled(DvObjectContainer container) { + return Boolean.getBoolean("dataverse.files." 
+ container.getEffectiveStorageDriverId() + ".upload-redirect"); } public String getDataCiteRestApiUrlString() { diff --git a/src/main/java/propertyFiles/biomedical.properties b/src/main/java/propertyFiles/biomedical.properties index c3fd3f81bc7..723a4ac2f40 100644 --- a/src/main/java/propertyFiles/biomedical.properties +++ b/src/main/java/propertyFiles/biomedical.properties @@ -1,31 +1,43 @@ metadatablock.name=biomedical metadatablock.displayName=Life Sciences Metadata datasetfieldtype.studyDesignType.title=Design Type +datasetfieldtype.studyOtherDesignType.title=Other Design Type datasetfieldtype.studyFactorType.title=Factor Type +datasetfieldtype.studyOtherFactorType.title=Other Factor Type datasetfieldtype.studyAssayOrganism.title=Organism datasetfieldtype.studyAssayOtherOrganism.title=Other Organism datasetfieldtype.studyAssayMeasurementType.title=Measurement Type datasetfieldtype.studyAssayOtherMeasurmentType.title=Other Measurement Type datasetfieldtype.studyAssayTechnologyType.title=Technology Type +datasetfieldtype.studyAssayOtherTechnologyType.title=Other Technology Type datasetfieldtype.studyAssayPlatform.title=Technology Platform +datasetfieldtype.studyAssayOtherPlatform.title=Other Technology Platform datasetfieldtype.studyAssayCellType.title=Cell Type datasetfieldtype.studyDesignType.description=Design types that are based on the overall experimental design. +datasetfieldtype.studyOtherDesignType.description=If Other was selected in Design Type, list any other design types that were used in this Dataset. datasetfieldtype.studyFactorType.description=Factors used in the Dataset. +datasetfieldtype.studyOtherFactorType.description=If Other was selected in Factor Type, list any other factor types that were used in this Dataset. datasetfieldtype.studyAssayOrganism.description=The taxonomic name of the organism used in the Dataset or from which the starting biological material derives. datasetfieldtype.studyAssayOtherOrganism.description=If Other was selected in Organism, list any other organisms that were used in this Dataset. Terms from the NCBI Taxonomy are recommended. datasetfieldtype.studyAssayMeasurementType.description=A term to qualify the endpoint, or what is being measured (e.g. gene expression profiling; protein identification). datasetfieldtype.studyAssayOtherMeasurmentType.description=If Other was selected in Measurement Type, list any other measurement types that were used. Terms from NCBO Bioportal are recommended. datasetfieldtype.studyAssayTechnologyType.description=A term to identify the technology used to perform the measurement (e.g. DNA microarray; mass spectrometry). +datasetfieldtype.studyAssayOtherTechnologyType.description=If Other was selected in Technology Type, list any other technology types that were used in this Dataset. datasetfieldtype.studyAssayPlatform.description=The manufacturer and name of the technology platform used in the assay (e.g. Bruker AVANCE). +datasetfieldtype.studyAssayOtherPlatform.description=If Other was selected in Technology Platform, list any other technology platforms that were used in this Dataset. datasetfieldtype.studyAssayCellType.description=The name of the cell line from which the source or sample derives. 
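Stepping back to the SystemConfig change just above: directUploadEnabled() now accepts any DvObjectContainer and keys off a per-store JVM system property read with Boolean.getBoolean(). A small illustration, using a hypothetical store id of "s3":

    // "s3" is a hypothetical store id; the property name pattern is the one
    // used in directUploadEnabled() above.
    String driverId = "s3";
    String key = "dataverse.files." + driverId + ".upload-redirect";

    // Boolean.getBoolean() reads a JVM system property (not an environment
    // variable), so direct upload is on only when the property is set to "true":
    System.setProperty(key, "true");
    boolean directUploadEnabled = Boolean.getBoolean(key); // true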
datasetfieldtype.studyDesignType.watermark= +datasetfieldtype.studyOtherDesignType.watermark= datasetfieldtype.studyFactorType.watermark= +datasetfieldtype.studyOtherFactorType.watermark= datasetfieldtype.studyAssayOrganism.watermark= datasetfieldtype.studyAssayOtherOrganism.watermark= datasetfieldtype.studyAssayMeasurementType.watermark= datasetfieldtype.studyAssayOtherMeasurmentType.watermark= datasetfieldtype.studyAssayTechnologyType.watermark= +datasetfieldtype.studyAssayOtherTechnologyType.watermark= datasetfieldtype.studyAssayPlatform.watermark= +datasetfieldtype.studyAssayOtherPlatform.watermark= datasetfieldtype.studyAssayCellType.watermark= controlledvocabulary.studyDesignType.case_control=Case Control controlledvocabulary.studyDesignType.cross_sectional=Cross Sectional @@ -36,6 +48,7 @@ controlledvocabulary.studyDesignType.parallel_group_design=Parallel Group Design controlledvocabulary.studyDesignType.perturbation_design=Perturbation Design controlledvocabulary.studyDesignType.randomized_controlled_trial=Randomized Controlled Trial controlledvocabulary.studyDesignType.technological_design=Technological Design +controlledvocabulary.studyDesignType.other=Other controlledvocabulary.studyFactorType.age=Age controlledvocabulary.studyFactorType.biomarkers=Biomarkers controlledvocabulary.studyFactorType.cell_surface_markers=Cell Surface Markers @@ -308,4 +321,4 @@ controlledvocabulary.studyAssayPlatform.xevo_g2_qtof_(waters)=Xevo G2 QTOF (Wate controlledvocabulary.studyAssayPlatform.xevo_qtof_ms_(waters)=Xevo QTof MS (Waters) controlledvocabulary.studyAssayPlatform.xevo_tq_ms_(waters)=Xevo TQ MS (Waters) controlledvocabulary.studyAssayPlatform.xevo_tq-s_(waters)=Xevo TQ-S (Waters) -controlledvocabulary.studyAssayPlatform.other=Other \ No newline at end of file +controlledvocabulary.studyAssayPlatform.other=Other diff --git a/src/main/resources/db/migration/V5.0.0.1__6872-assign-storage-drivers-to-datasets.sql b/src/main/resources/db/migration/V5.0.0.1__6872-assign-storage-drivers-to-datasets.sql new file mode 100644 index 00000000000..453b2054c43 --- /dev/null +++ b/src/main/resources/db/migration/V5.0.0.1__6872-assign-storage-drivers-to-datasets.sql @@ -0,0 +1 @@ +ALTER TABLE dataset ADD COLUMN IF NOT EXISTS storagedriver VARCHAR(255); \ No newline at end of file diff --git a/src/main/webapp/dataverse.xhtml b/src/main/webapp/dataverse.xhtml index 0c97b1a0ce2..fbbe7563baf 100644 --- a/src/main/webapp/dataverse.xhtml +++ b/src/main/webapp/dataverse.xhtml @@ -19,6 +19,7 @@ + @@ -43,7 +44,7 @@ - + @@ -95,7 +96,7 @@ @@ -110,7 +111,10 @@ data-toggle="tooltip" data-placement="auto right" data-original-title="#{bundle['dataverse.name.title']}">
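The one-line migration above adds a nullable storagedriver column to the dataset table, which is what the getEffectiveStorageDriverId() calls earlier in this diff resolve against. A speculative sketch of the fallback order; the real logic lives in DvObjectContainer and is not part of this excerpt:

    // Assumption: simplified resolution order; the actual DvObjectContainer
    // implementation may differ.
    public String getEffectiveStorageDriverId() {
        if (storageDriver != null && !storageDriver.isEmpty()) {
            return storageDriver;                            // dataset-level setting
        }
        if (getOwner() != null) {
            return getOwner().getEffectiveStorageDriverId(); // inherit from the dataverse
        }
        return "file"; // placeholder for the installation-wide default store
    }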
@@ -135,7 +139,10 @@
@@ -164,7 +171,8 @@
@@ -191,7 +199,8 @@
@@ -385,7 +394,7 @@
@@ -494,7 +503,7 @@
[The Facelets markup for these dataverse.xhtml hunks was lost in extraction; only the hunk headers and stray attribute fragments such as #{systemConfig.dataverseSiteUrl}/dataverse/ and #{bundle['dataverse.category.title']} survive.]
  • diff --git a/src/main/webapp/search-include-fragment.xhtml b/src/main/webapp/search-include-fragment.xhtml index f3a4220bed6..3260495c4e7 100644 --- a/src/main/webapp/search-include-fragment.xhtml +++ b/src/main/webapp/search-include-fragment.xhtml @@ -232,7 +232,7 @@
    @@ -448,7 +448,7 @@ - + @@ -461,7 +461,7 @@ - + @@ -519,7 +519,7 @@ - + @@ -586,7 +586,7 @@ - + diff --git a/src/test/java/edu/harvard/iq/dataverse/api/DownloadFilesIT.java b/src/test/java/edu/harvard/iq/dataverse/api/DownloadFilesIT.java index 908beeac941..dde79574b87 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/DownloadFilesIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/DownloadFilesIT.java @@ -350,7 +350,7 @@ public void downloadAllFilesTabular() throws IOException { .body("data.files[0].label", equalTo("50by1000.dta")); // UtilIT.MAXIMUM_INGEST_LOCK_DURATION is 3 but not long enough. - assertTrue("Failed test if Ingest Lock exceeds max duration " + pathToFile, UtilIT.sleepForLock(datasetId.longValue(), "Ingest", apiToken, 4)); + assertTrue("Failed test if Ingest Lock exceeds max duration " + pathToFile, UtilIT.sleepForLock(datasetId.longValue(), "Ingest", apiToken, UtilIT.MAXIMUM_INGEST_LOCK_DURATION + 3)); Response downloadFiles1 = UtilIT.downloadFiles(datasetPid, apiToken); downloadFiles1.then().assertThat() diff --git a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java index 4d8c6e2a8d9..a538cb54f59 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java @@ -1413,6 +1413,62 @@ public void testValidateDDI_issue6027() throws InterruptedException { } + /* + A very simple test for shape file package processing. + */ + @Test + public void test_ProcessShapeFilePackage() { + msgt("test_ProcessShapeFilePackage"); + // Create user + String apiToken = createUserGetToken(); + + // Create Dataverse + String dataverseAlias = createDataverseGetAlias(apiToken); + + // Create Dataset + Integer datasetId = createDatasetGetId(dataverseAlias, apiToken); + + // This archive contains 4 files that constitute a valid + // shape file. We want to check that these files were properly + // recognized and re-zipped as a shape package, preserving the + // folder structure found in the uploaded zip. + String pathToFile = "scripts/search/data/shape/shapefile.zip"; + + String suppliedDescription = "file extracted from a shape bundle"; + String extractedFolderName = "subfolder"; + String extractedShapeName = "boston_public_schools_2012_z1l.zip"; + String extractedShapeType = "application/zipped-shapefile"; + + JsonObjectBuilder json = Json.createObjectBuilder() + .add("description", suppliedDescription); + + Response addResponse = UtilIT.uploadFileViaNative(datasetId.toString(), pathToFile, json.build(), apiToken); + + msgt("Server response: " + addResponse.prettyPrint()); + + // We are checking the following: + // - that the upload succeeded; + // - that a shape file with the name specified above has been repackaged and added + // to the dataset as a single file; + // - that the mime type has been properly identified; + // - that the description supplied via the API has been added; + // - that the subfolder found inside the uploaded zip file has been properly + // preserved in the FileMetadata. + // + // Feel free to expand the checks further - we can also verify the + // checksum, the size of the resulting file, add more files to the uploaded + // zip archive etc. etc. - but this should be a good start. + // -- L.A. 
2020/09 + addResponse.then().assertThat() + .body("status", equalTo(AbstractApiBean.STATUS_OK)) + .body("data.files[0].dataFile.contentType", equalTo(extractedShapeType)) + .body("data.files[0].label", equalTo(extractedShapeName)) + .body("data.files[0].directoryLabel", equalTo(extractedFolderName)) + .body("data.files[0].description", equalTo(suppliedDescription)) + .statusCode(OK.getStatusCode()); + } + + private void msg(String m){ System.out.println(m); } diff --git a/src/test/java/edu/harvard/iq/dataverse/api/IndexIT.java b/src/test/java/edu/harvard/iq/dataverse/api/IndexIT.java new file mode 100644 index 00000000000..40d1488ffc0 --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/api/IndexIT.java @@ -0,0 +1,115 @@ +package edu.harvard.iq.dataverse.api; + +import com.jayway.restassured.RestAssured; +import static com.jayway.restassured.RestAssured.given; +import com.jayway.restassured.path.json.JsonPath; +import com.jayway.restassured.response.Response; +import static edu.harvard.iq.dataverse.api.UtilIT.API_TOKEN_HTTP_HEADER; +import edu.harvard.iq.dataverse.settings.SettingsServiceBean; +import java.util.ArrayList; +import java.util.logging.Logger; +import static javax.ws.rs.core.Response.Status.CREATED; +import static javax.ws.rs.core.Response.Status.NO_CONTENT; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import static javax.ws.rs.core.Response.Status.OK; +import static junit.framework.Assert.assertEquals; +import org.hamcrest.CoreMatchers; +import static org.hamcrest.CoreMatchers.equalTo; +import org.junit.After; + +public class IndexIT { + + private static final Logger logger = Logger.getLogger(IndexIT.class.getCanonicalName()); + + @BeforeClass + public static void setUpClass() { + + RestAssured.baseURI = UtilIT.getRestAssuredBaseUri(); + + Response makeSureTokenlessSearchIsEnabled = UtilIT.deleteSetting(SettingsServiceBean.Key.SearchApiRequiresToken); + makeSureTokenlessSearchIsEnabled.then().assertThat() + .statusCode(OK.getStatusCode()); + + Response remove = UtilIT.deleteSetting(SettingsServiceBean.Key.ThumbnailSizeLimitImage); + remove.then().assertThat() + .statusCode(200); + + } + + + @Test + public void testIndexStatus() { + + Response createUser = UtilIT.createRandomUser(); + createUser.prettyPrint(); + String username = UtilIT.getUsernameFromResponse(createUser); + String apiToken = UtilIT.getApiTokenFromResponse(createUser); + + Response createDataverseResponse = UtilIT.createRandomDataverse(apiToken); + createDataverseResponse.prettyPrint(); + String dataverseAlias = UtilIT.getAliasFromResponse(createDataverseResponse); + + Response createDatasetResponse = UtilIT.createRandomDatasetViaNativeApi(dataverseAlias, apiToken); + createDatasetResponse.prettyPrint(); + Integer datasetId = UtilIT.getDatasetIdFromResponse(createDatasetResponse); + Response getDatasetJsonNoFiles = UtilIT.nativeGet(datasetId, apiToken); + getDatasetJsonNoFiles.prettyPrint(); + String protocol1 = JsonPath.from(getDatasetJsonNoFiles.getBody().asString()).getString("data.protocol"); + String authority1 = JsonPath.from(getDatasetJsonNoFiles.getBody().asString()).getString("data.authority"); + String identifier1 = JsonPath.from(getDatasetJsonNoFiles.getBody().asString()).getString("data.identifier"); + String dataset1PersistentId = protocol1 + ":" + authority1 + "/" + identifier1; + + Response uploadMd5File = UtilIT.uploadRandomFile(dataset1PersistentId, apiToken); + uploadMd5File.prettyPrint(); + assertEquals(CREATED.getStatusCode(), 
uploadMd5File.getStatusCode()); + + Response response = given() + .header(API_TOKEN_HTTP_HEADER, apiToken) + .queryParam("sync","true") + .get("/api/admin/index/status"); + response.prettyPrint(); + ArrayList emptyList = new ArrayList<>(); + response.then().assertThat().statusCode(OK.getStatusCode()) + .body("data.contentInDatabaseButStaleInOrMissingFromIndex.dataverses", CoreMatchers.equalTo(emptyList)) + .body("data.contentInDatabaseButStaleInOrMissingFromIndex.datasets", CoreMatchers.equalTo(emptyList)) + .body("data.contentInIndexButNotDatabase.dataverses", CoreMatchers.equalTo(emptyList)) + .body("data.contentInIndexButNotDatabase.datasets", CoreMatchers.equalTo(emptyList)) + .body("data.contentInIndexButNotDatabase.files", CoreMatchers.equalTo(emptyList)) + .body("data.permissionsInDatabaseButStaleInOrMissingFromIndex.dvobjects", CoreMatchers.equalTo(emptyList)) + .body("data.permissionsInIndexButNotDatabase.permissions", CoreMatchers.equalTo(emptyList)); + + Response getDatasetJsonAfterMd5File = UtilIT.nativeGet(datasetId, apiToken); + getDatasetJsonAfterMd5File.prettyPrint(); + getDatasetJsonAfterMd5File.then().assertThat() + .body("data.latestVersion.files[0].dataFile.md5", equalTo("0386269a5acb2c57b4eade587ff4db64")) + .body("data.latestVersion.files[0].dataFile.checksum.type", equalTo("MD5")) + .body("data.latestVersion.files[0].dataFile.checksum.value", equalTo("0386269a5acb2c57b4eade587ff4db64")); + + int fileId = JsonPath.from(getDatasetJsonAfterMd5File.getBody().asString()).getInt("data.latestVersion.files[0].dataFile.id"); + Response deleteFile = UtilIT.deleteFile(fileId, apiToken); + deleteFile.prettyPrint(); + deleteFile.then().assertThat() + .statusCode(NO_CONTENT.getStatusCode()); + + Response deleteDatasetResponse = UtilIT.deleteDatasetViaNativeApi(datasetId, apiToken); + deleteDatasetResponse.prettyPrint(); + + Response deleteDataverseResponse = UtilIT.deleteDataverse(dataverseAlias, apiToken); + deleteDataverseResponse.prettyPrint(); + + Response deleteUserResponse = UtilIT.deleteUser(username); + deleteUserResponse.prettyPrint(); + + } + + @After + public void tearDownDataverse() { + } + + @AfterClass + public static void cleanup() { + } + +} diff --git a/tests/jenkins/ec2/Jenkinsfile b/tests/jenkins/ec2/Jenkinsfile index 6d387c6e4db..4a16f865886 100644 --- a/tests/jenkins/ec2/Jenkinsfile +++ b/tests/jenkins/ec2/Jenkinsfile @@ -41,6 +41,13 @@ pipeline { sourcePattern: 'src/main/java', exclusionPattern: 'src/test*'] ) + script { + if (fileExists('./ansible_complete')) { + sh '/bin/rm ./ansible_complete' + } else { + error('Ansible run terminated abnormally, failing build.') + } + } } } }
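For reference, the index status endpoint exercised by IndexIT above can also be called outside the test suite. A minimal sketch using java.net.http; the base URL and API token are placeholders, and the X-Dataverse-key header mirrors the one the integration tests send:

    import java.net.URI;
    import java.net.http.HttpClient;
    import java.net.http.HttpRequest;
    import java.net.http.HttpResponse;

    public class IndexStatusCheck {
        public static void main(String[] args) throws Exception {
            // Placeholder base URL and superuser API token - adjust for your installation.
            String baseUrl = "http://localhost:8080";
            String apiToken = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx";

            HttpRequest request = HttpRequest.newBuilder()
                    .uri(URI.create(baseUrl + "/api/admin/index/status?sync=true"))
                    .header("X-Dataverse-key", apiToken)
                    .GET()
                    .build();

            HttpResponse<String> response = HttpClient.newHttpClient()
                    .send(request, HttpResponse.BodyHandlers.ofString());

            // The JSON body lists content and permissions that are stale in or
            // missing from the Solr index; empty lists mean DB and index agree.
            System.out.println(response.body());
        }
    }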