From 0b375cfb38c5303c18045a80aaf347977414b8ef Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 18 Oct 2022 17:43:55 -0400 Subject: [PATCH 01/22] add type for person/org, add sameas, fix affiliation --- .../harvard/iq/dataverse/DatasetVersion.java | 50 +++++++++---- .../iq/dataverse/util/PersonOrOrgUtil.java | 72 +++++++++++++++++++ 2 files changed, 107 insertions(+), 15 deletions(-) create mode 100644 src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index 30815c43381..64371148254 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -1,6 +1,7 @@ package edu.harvard.iq.dataverse; import edu.harvard.iq.dataverse.util.MarkupChecker; +import edu.harvard.iq.dataverse.util.PersonOrOrgUtil; import edu.harvard.iq.dataverse.util.BundleUtil; import edu.harvard.iq.dataverse.DatasetFieldType.FieldType; import edu.harvard.iq.dataverse.branding.BrandingUtil; @@ -1802,27 +1803,46 @@ public String getJsonLd() { for (DatasetAuthor datasetAuthor : this.getDatasetAuthors()) { JsonObjectBuilder author = Json.createObjectBuilder(); String name = datasetAuthor.getName().getDisplayValue(); + String identifierAsUrl = datasetAuthor.getIdentifierAsUrl(); DatasetField authorAffiliation = datasetAuthor.getAffiliation(); String affiliation = null; if (authorAffiliation != null) { affiliation = datasetAuthor.getAffiliation().getDisplayValue(); } - // We are aware of "givenName" and "familyName" but instead of a person it might be an organization such as "Gallup Organization". - //author.add("@type", "Person"); - author.add("name", name); - // We are aware that the following error is thrown by https://search.google.com/structured-data/testing-tool - // "The property affiliation is not recognized by Google for an object of type Thing." 
- // Someone at Google has said this is ok. - // This logic could be moved into the `if (authorAffiliation != null)` block above. - if (!StringUtil.isEmpty(affiliation)) { - author.add("affiliation", affiliation); - } - String identifierAsUrl = datasetAuthor.getIdentifierAsUrl(); - if (identifierAsUrl != null) { - // It would be valid to provide an array of identifiers for authors but we have decided to only provide one. - author.add("@id", identifierAsUrl); - author.add("identifier", identifierAsUrl); + JsonObject entity = PersonOrOrgUtil.getPersonOrOrganization(name, (identifierAsUrl==null)); + String givenName= entity.getString("givenName"); + String familyName= entity.getString("familyName"); + + if (entity.getBoolean("isPerson")) { + // Person + author.add("@type", "Person"); + if (givenName != null) { + author.add("givenName", givenName); + } + if (familyName != null) { + author.add("familyName", familyName); + } + if (!StringUtil.isEmpty(affiliation)) { + author.add("affiliation", Json.createObjectBuilder().add("@type", "Organization").add("name", affiliation)); + } + //Currently all possible identifier URLs are for people not Organizations + if(identifierAsUrl != null) { + author.add("sameas", identifierAsUrl); + //Legacy - not sure if these are still useful + author.add("@id", identifierAsUrl); + author.add("identifier", identifierAsUrl); + + } + } else { + // Organization + author.add("@type", "Organization"); + if (!StringUtil.isEmpty(affiliation)) { + author.add("parentOrganization", Json.createObjectBuilder().add("@type", "Organization").add("name", affiliation)); + } } + // Both cases + author.add("name", entity.getString("name")); + //And add to the array authors.add(author); } JsonArray authorsArray = authors.build(); diff --git a/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java new file mode 100644 index 00000000000..8d767d2e535 --- /dev/null +++ 
b/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java @@ -0,0 +1,72 @@ +package edu.harvard.iq.dataverse.util; + +import javax.json.JsonObject; +import javax.json.JsonObjectBuilder; + +import edu.harvard.iq.dataverse.export.openaire.Cleanup; +import edu.harvard.iq.dataverse.export.openaire.FirstNames; +import edu.harvard.iq.dataverse.export.openaire.Organizations; +import edu.harvard.iq.dataverse.util.json.NullSafeJsonBuilder; + +/** + * + * @author qqmyers + * + * Adapted from earlier code in OpenAireExportUtil + * + * Implements an algorithm derived from code at DataCite to determine + * whether a name is that of a Person or Organization and, if the + * former, to pull out the given and family names. + */ + +public class PersonOrOrgUtil { + + public static JsonObject getPersonOrOrganization(String name, boolean organizationIfTied) { + name = Cleanup.normalize(name); + + String givenName = null; + String familyName = null; + // Datacite algorithm, + // https://github.com/IQSS/dataverse/issues/2243#issuecomment-358615313 + boolean isOrganization = Organizations.getInstance().isOrganization(name); + if (name.contains(",")) { + givenName = FirstNames.getInstance().getFirstName(name); + // contributorName=, + if (givenName != null && !isOrganization) { + // givenName ok + isOrganization = false; + // contributor_map.put("nameType", "Personal"); + if (!name.replaceFirst(",", "").contains(",")) { + // contributorName=, + String[] fullName = name.split(", "); + givenName = fullName[1]; + familyName = fullName[0]; + } + } else if (isOrganization || organizationIfTied) { + isOrganization = true; + } + + } else { + givenName = FirstNames.getInstance().getFirstName(name); + + if (givenName != null && !isOrganization) { + isOrganization = false; + if (givenName.length() + 1 < name.length()) { + familyName = name.substring(givenName.length() + 1); + } + } else { + // default + if (isOrganization || organizationIfTied) { + isOrganization = true; + } + } + } + 
JsonObjectBuilder job = new NullSafeJsonBuilder(); + job.add("fullname", name); + job.add("givenName", givenName); + job.add("familyName", familyName); + job.add("isPerson", !isOrganization); + return job.build(); + + } +} From 5bd58d8f4390fc4eed31ba8f64835b527ca939fb Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 18 Oct 2022 18:02:01 -0400 Subject: [PATCH 02/22] typo --- src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index 64371148254..2aca5cc9705 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -1841,7 +1841,7 @@ public String getJsonLd() { } } // Both cases - author.add("name", entity.getString("name")); + author.add("name", entity.getString("fullname")); //And add to the array authors.add(author); } From 63cd77d2a834221889125fbda952bd193e44d099 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 19 Oct 2022 13:34:46 -0400 Subject: [PATCH 03/22] capitalization --- src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index 2aca5cc9705..8aaf0d2fd89 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -1827,7 +1827,7 @@ public String getJsonLd() { } //Currently all possible identifier URLs are for people not Organizations if(identifierAsUrl != null) { - author.add("sameas", identifierAsUrl); + author.add("sameAs", identifierAsUrl); //Legacy - not sure if these are still useful author.add("@id", identifierAsUrl); author.add("identifier", identifierAsUrl); From 8084fb8796700eb56fdbcc17a6b792946875a5f0 Mon 
Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 19 Oct 2022 13:37:45 -0400 Subject: [PATCH 04/22] update tests --- .../iq/dataverse/export/SchemaDotOrgExporterTest.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporterTest.java b/src/test/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporterTest.java index b5453e75fe5..2327de43ca4 100644 --- a/src/test/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporterTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporterTest.java @@ -137,13 +137,15 @@ public void testExportDataset() throws Exception { assertEquals("https://doi.org/10.5072/FK2/IMK5A4", json2.getString("identifier")); assertEquals("Darwin's Finches", json2.getString("name")); assertEquals("Finch, Fiona", json2.getJsonArray("creator").getJsonObject(0).getString("name")); - assertEquals("Birds Inc.", json2.getJsonArray("creator").getJsonObject(0).getString("affiliation")); + assertEquals("Birds Inc.", json2.getJsonArray("creator").getJsonObject(0).getJsonObject("affiliation").getString("name")); assertEquals("https://orcid.org/0000-0002-1825-0097", json2.getJsonArray("creator").getJsonObject(0).getString("@id")); assertEquals("https://orcid.org/0000-0002-1825-0097", json2.getJsonArray("creator").getJsonObject(0).getString("identifier")); + assertEquals("https://orcid.org/0000-0002-1825-0097", json2.getJsonArray("creator").getJsonObject(0).getString("sameAs")); assertEquals("Finch, Fiona", json2.getJsonArray("author").getJsonObject(0).getString("name")); - assertEquals("Birds Inc.", json2.getJsonArray("author").getJsonObject(0).getString("affiliation")); + assertEquals("Birds Inc.", json2.getJsonArray("author").getJsonObject(0).getJsonObject("affiliation").getString("name")); assertEquals("https://orcid.org/0000-0002-1825-0097", json2.getJsonArray("author").getJsonObject(0).getString("@id")); 
assertEquals("https://orcid.org/0000-0002-1825-0097", json2.getJsonArray("author").getJsonObject(0).getString("identifier")); + assertEquals("https://orcid.org/0000-0002-1825-0097", json2.getJsonArray("author").getJsonObject(0).getString("sameAs")); assertEquals("1955-11-05", json2.getString("datePublished")); assertEquals("1955-11-05", json2.getString("dateModified")); assertEquals("1", json2.getString("version")); From 489d0e36e9b81c095b7387522d95b92516c00b69 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 19 Oct 2022 13:40:43 -0400 Subject: [PATCH 05/22] legacy test issue --- .../harvard/iq/dataverse/export/SchemaDotOrgExporterTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporterTest.java b/src/test/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporterTest.java index 2327de43ca4..68bab7c8bb4 100644 --- a/src/test/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporterTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporterTest.java @@ -170,7 +170,7 @@ public void testExportDataset() throws Exception { assertEquals("LibraScholar", json2.getJsonObject("includedInDataCatalog").getString("name")); assertEquals("https://librascholar.org", json2.getJsonObject("includedInDataCatalog").getString("url")); assertEquals("Organization", json2.getJsonObject("publisher").getString("@type")); - assertEquals("LibraScholar", json2.getJsonObject("provider").getString("name")); + assertEquals("LibraScholar", json2.getJsonObject("publisher").getString("name")); assertEquals("Organization", json2.getJsonObject("provider").getString("@type")); assertEquals("LibraScholar", json2.getJsonObject("provider").getString("name")); assertEquals("Organization", json2.getJsonArray("funder").getJsonObject(0).getString("@type")); From c3260a5009c99f0765d012d9ce20ef27048cb738 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 19 Oct 2022 17:41:21 -0400 Subject: [PATCH 
06/22] change fullname -> fullName --- src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java | 2 +- .../java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index 8aaf0d2fd89..8e9a0950b2a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -1841,7 +1841,7 @@ public String getJsonLd() { } } // Both cases - author.add("name", entity.getString("fullname")); + author.add("name", entity.getString("fullName")); //And add to the array authors.add(author); } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java index 8d767d2e535..add5c8285ae 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java @@ -62,7 +62,7 @@ public static JsonObject getPersonOrOrganization(String name, boolean organizati } } JsonObjectBuilder job = new NullSafeJsonBuilder(); - job.add("fullname", name); + job.add("fullName", name); job.add("givenName", givenName); job.add("familyName", familyName); job.add("isPerson", !isOrganization); From 3ddc7960f24a63bf322d90befd71c3c440ab3101 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 19 Oct 2022 17:41:31 -0400 Subject: [PATCH 07/22] note todos --- .../export/openaire/OpenAireExportUtil.java | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/export/openaire/OpenAireExportUtil.java b/src/main/java/edu/harvard/iq/dataverse/export/openaire/OpenAireExportUtil.java index 49fe203b96d..bea3858a60e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/export/openaire/OpenAireExportUtil.java +++ 
b/src/main/java/edu/harvard/iq/dataverse/export/openaire/OpenAireExportUtil.java @@ -256,7 +256,10 @@ public static void writeCreatorsElement(XMLStreamWriter xmlw, DatasetVersionDTO creator_map.put("nameType", "Personal"); nameType_check = true; } - + // ToDo - the algorithm to determine if this is a Person or Organization here + // has been abstracted into a separate + // edu.harvard.iq.dataverse.util.PersonOrOrgUtil class that could be used here + // to avoid duplication/variants of the algorithm creatorName = Cleanup.normalize(creatorName); // Datacite algorithm, https://github.com/IQSS/dataverse/issues/2243#issuecomment-358615313 if (creatorName.contains(",")) { @@ -706,6 +709,11 @@ public static void writeContributorElement(XMLStreamWriter xmlw, String contribu boolean nameType_check = false; Map contributor_map = new HashMap(); + // ToDo - the algorithm to determine if this is a Person or Organization here + // has been abstracted into a separate + // edu.harvard.iq.dataverse.util.PersonOrOrgUtil class that could be used here + // to avoid duplication/variants of the algorithm + contributorName = Cleanup.normalize(contributorName); // Datacite algorithm, https://github.com/IQSS/dataverse/issues/2243#issuecomment-358615313 if (contributorName.contains(",")) { @@ -717,6 +725,9 @@ public static void writeContributorElement(XMLStreamWriter xmlw, String contribu // givenName ok contributor_map.put("nameType", "Personal"); nameType_check = true; + // re: the above toDo - the ("ContactPerson".equals(contributorType) && + // !isValidEmailAddress(contributorName)) clause in the next line could/should + // be sent as the OrgIfTied boolean parameter } else if (isOrganization || ("ContactPerson".equals(contributorType) && !isValidEmailAddress(contributorName))) { contributor_map.put("nameType", "Organizational"); } From 05ea63aa98a7c896fbfbbfa00eb4c6755bd317ad Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 19 Oct 2022 17:42:27 -0400 Subject: [PATCH 08/22] add tests 
same examples as in OrganizationTest but using the extracted algorithm and also checking given/family name in relevant cases --- .../dataverse/util/PersonOrOrgUtilTest.java | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 src/test/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtilTest.java diff --git a/src/test/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtilTest.java b/src/test/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtilTest.java new file mode 100644 index 00000000000..32c72e9497c --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtilTest.java @@ -0,0 +1,92 @@ +package edu.harvard.iq.dataverse.util; + +import edu.harvard.iq.dataverse.export.openaire.Organizations; +import edu.harvard.iq.dataverse.util.json.JsonUtil; + +import org.junit.Ignore; +import org.junit.Test; +import static org.junit.Assert.*; + +import javax.json.JsonObject; + +public class PersonOrOrgUtilTest { + + public PersonOrOrgUtilTest() { + } + + @Test + public void testOrganizationSimpleName() { + verifyIsOrganization("IBM"); + verifyIsOrganization("Harvard University"); + } + + @Test + public void testOrganizationCOMPLEXName() { + verifyIsOrganization("The Institute for Quantitative Social Science"); + verifyIsOrganization("Council on Aging"); + verifyIsOrganization("The Ford Foundation"); + verifyIsOrganization("United Nations Economic and Social Commission for Asia and the Pacific (UNESCAP)"); + verifyIsOrganization("Michael J. Fox Foundation for Parkinson's Research"); + } + + @Test + public void testOrganizationComaOrDash() { + verifyIsOrganization("Digital Archive of Massachusetts Anti-Slavery and Anti-Segregation Petitions, Massachusetts Archives, Boston MA"); + verifyIsOrganization("U.S. Department of Commerce, Bureau of the Census, Geography Division"); + verifyIsOrganization("Harvard Map Collection, Harvard College Library"); + verifyIsOrganization("Geographic Data Technology, Inc. 
(GDT)"); + } + + @Ignore + @Test + public void testOrganizationES() { + //Spanish recognition is not enabled - see export/Organization.java + verifyIsOrganization("Compañía de San Fernando"); + } + + /** + * Name is composed of: + * + */ + @Test + public void testName() { + verifyIsPerson("Jorge Mario Bergoglio", "Jorge Mario", "Bergoglio"); + verifyIsPerson("Bergoglio", null, null); + verifyIsPerson("Francesco Cadili", "Francesco", "Cadili"); + // This Philip Seymour Hoffman example is from ShibUtilTest. + verifyIsPerson("Philip Seymour Hoffman", "Philip Seymour", "Hoffman"); + + // test Smith (is also a name) + verifyIsPerson("John Smith", "John", "Smith"); + // resolved using hint file + verifyIsPerson("Guido van Rossum", "Guido", "van Rossum"); + // test only name + verifyIsPerson("Francesco", "Francesco", null); + // test only family name + verifyIsPerson("Cadili", null, null); + } + + private void verifyIsOrganization(String fullName) { + JsonObject obj = PersonOrOrgUtil.getPersonOrOrganization(fullName, false); + System.out.println(JsonUtil.prettyPrint(obj)); + assertEquals(obj.getString("fullName"),fullName); + assertFalse(obj.getBoolean("isPerson")); + + } + + private void verifyIsPerson(String fullName, String givenName, String familyName) { + JsonObject obj = PersonOrOrgUtil.getPersonOrOrganization(fullName, false); + System.out.println(JsonUtil.prettyPrint(obj)); + assertEquals(obj.getString("fullName"),fullName); + assertTrue(obj.getBoolean("isPerson")); + assertEquals(obj.containsKey("givenName"), givenName != null); + if(obj.containsKey("givenName") && givenName != null) { + assertEquals(obj.getString("givenName"),givenName); + } + assertEquals(obj.containsKey("familyName"), familyName != null); + if(obj.containsKey("familyName") && familyName != null) { + assertEquals(obj.getString("familyName"),familyName); + } + } + + } From 6ca9f7099698bcfe08f6fbc98379f3f989d6a283 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 19 Oct 2022 17:44:02 -0400 
Subject: [PATCH 09/22] don't send giveName for orgs it does not appear to be useful given the tests in PersonOrOrgUtilTest --- .../java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java index add5c8285ae..468949e8a40 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java @@ -44,6 +44,7 @@ public static JsonObject getPersonOrOrganization(String name, boolean organizati } } else if (isOrganization || organizationIfTied) { isOrganization = true; + givenName=null; } } else { @@ -58,6 +59,7 @@ public static JsonObject getPersonOrOrganization(String name, boolean organizati // default if (isOrganization || organizationIfTied) { isOrganization = true; + givenName=null; } } } From a5fafd079d64ed334fa45fa238765815bd262f05 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 19 Oct 2022 17:48:12 -0400 Subject: [PATCH 10/22] release note --- doc/release-notes/7349-4-schema.org-updates.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 doc/release-notes/7349-4-schema.org-updates.md diff --git a/doc/release-notes/7349-4-schema.org-updates.md b/doc/release-notes/7349-4-schema.org-updates.md new file mode 100644 index 00000000000..1247471f137 --- /dev/null +++ b/doc/release-notes/7349-4-schema.org-updates.md @@ -0,0 +1,3 @@ +The Schema.org metadata export and the schema.org metadata embedded in dataset pages has been updated to improve compliance with Schema.org's schema and Google's recommendations. 
+ +Backward compatibility - author/creators now have an @type of Person or Organization and any affiliation (affiliation for Person, parentOrganization for Organization) is now an object of @type Organization \ No newline at end of file From f222160d16705b99f1e942037fc68828732f9934 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 20 Oct 2022 18:06:22 -0400 Subject: [PATCH 11/22] bugfix for no givenName/familyName from algorithm --- src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index 8e9a0950b2a..278ab246fcf 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -1810,8 +1810,8 @@ public String getJsonLd() { affiliation = datasetAuthor.getAffiliation().getDisplayValue(); } JsonObject entity = PersonOrOrgUtil.getPersonOrOrganization(name, (identifierAsUrl==null)); - String givenName= entity.getString("givenName"); - String familyName= entity.getString("familyName"); + String givenName= entity.containsKey("givenName") ? entity.getString("givenName"):null; + String familyName= entity.containsKey("familyName")? 
entity.getString("familyName"):null; if (entity.getBoolean("isPerson")) { // Person From 41c30d9de4970b57f8547dbb443d594aefc92e9e Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 21 Oct 2022 15:22:30 -0400 Subject: [PATCH 12/22] add assumeCommaInPersonName and tests --- .../harvard/iq/dataverse/DatasetVersion.java | 2 +- .../iq/dataverse/util/PersonOrOrgUtil.java | 51 ++++++++++++++----- .../dataverse/util/PersonOrOrgUtilTest.java | 14 ++++- 3 files changed, 53 insertions(+), 14 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index 278ab246fcf..1204d1dd4f1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -1811,7 +1811,7 @@ public String getJsonLd() { } JsonObject entity = PersonOrOrgUtil.getPersonOrOrganization(name, (identifierAsUrl==null)); String givenName= entity.containsKey("givenName") ? entity.getString("givenName"):null; - String familyName= entity.containsKey("familyName")? entity.getString("familyName"):null; + String familyName= entity.containsKey("familyName") ? entity.getString("familyName"):null; if (entity.getBoolean("isPerson")) { // Person diff --git a/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java index 468949e8a40..b8089422fcd 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java @@ -17,18 +17,37 @@ * Implements an algorithm derived from code at DataCite to determine * whether a name is that of a Person or Organization and, if the * former, to pull out the given and family names. + * + * Adds a parameter that can improve accuracy, e.g. for curated + * repositories, allowing the code to assume that all Person entries are + * in , order. 
+ * + * Possible ToDo - one could also allow local configuration of specific + * words that will automatically categorize one-off cases that the + * algorithm would otherwise mis-categorize. For example, the code + * appears to not recognize names ending in "Project" as an + * Organization. + * */ public class PersonOrOrgUtil { + static boolean assumeCommaInPersonName = false; + + static { + setAssumeCommaInPersonName(Boolean.parseBoolean(System.getProperty("dataverse.personOrOrg.assumeCommaInPersonName", "false"))); + } + public static JsonObject getPersonOrOrganization(String name, boolean organizationIfTied) { name = Cleanup.normalize(name); String givenName = null; String familyName = null; - // Datacite algorithm, + // adapted from a Datacite algorithm, // https://github.com/IQSS/dataverse/issues/2243#issuecomment-358615313 boolean isOrganization = Organizations.getInstance().isOrganization(name); + // ToDo - could add a check of stop words to handle problem cases, i.e. if name + // contains something in that list, it is an org if (name.contains(",")) { givenName = FirstNames.getInstance().getFirstName(name); // contributorName=, @@ -48,18 +67,21 @@ public static JsonObject getPersonOrOrganization(String name, boolean organizati } } else { - givenName = FirstNames.getInstance().getFirstName(name); - - if (givenName != null && !isOrganization) { - isOrganization = false; - if (givenName.length() + 1 < name.length()) { - familyName = name.substring(givenName.length() + 1); - } + if (assumeCommaInPersonName) { + isOrganization = true; } else { - // default - if (isOrganization || organizationIfTied) { - isOrganization = true; - givenName=null; + givenName = FirstNames.getInstance().getFirstName(name); + + if (givenName != null && !isOrganization) { + isOrganization = false; + if (givenName.length() + 1 < name.length()) { + familyName = name.substring(givenName.length() + 1); + } + } else { + // default + if (isOrganization || organizationIfTied) { + 
isOrganization = true; + } } } } @@ -71,4 +93,9 @@ public static JsonObject getPersonOrOrganization(String name, boolean organizati return job.build(); } + + public static void setAssumeCommaInPersonName(boolean assume) { + assumeCommaInPersonName = assume; + } + } diff --git a/src/test/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtilTest.java b/src/test/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtilTest.java index 32c72e9497c..dbda622b536 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtilTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtilTest.java @@ -30,7 +30,19 @@ public void testOrganizationCOMPLEXName() { } @Test - public void testOrganizationComaOrDash() { + public void testOrganizationAcademicName() { + + verifyIsOrganization("John Smith Center"); + verifyIsOrganization("John Smith Group"); + //An example the base algorithm doesn't handle: + PersonOrOrgUtil.setAssumeCommaInPersonName(true); + verifyIsOrganization("John Smith Project"); + PersonOrOrgUtil.setAssumeCommaInPersonName(false); + } + + + @Test + public void testOrganizationCommaOrDash() { verifyIsOrganization("Digital Archive of Massachusetts Anti-Slavery and Anti-Segregation Petitions, Massachusetts Archives, Boston MA"); verifyIsOrganization("U.S. 
Department of Commerce, Bureau of the Census, Geography Division"); verifyIsOrganization("Harvard Map Collection, Harvard College Library"); From d5d365589f627bedd529cbb93be5af33ae63e560 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 21 Oct 2022 17:02:44 -0400 Subject: [PATCH 13/22] update docs/release note --- doc/release-notes/7349-4-schema.org-updates.md | 2 ++ doc/sphinx-guides/source/installation/config.rst | 10 ++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/doc/release-notes/7349-4-schema.org-updates.md b/doc/release-notes/7349-4-schema.org-updates.md index 1247471f137..2c78243dc29 100644 --- a/doc/release-notes/7349-4-schema.org-updates.md +++ b/doc/release-notes/7349-4-schema.org-updates.md @@ -1,3 +1,5 @@ The Schema.org metadata export and the schema.org metadata embedded in dataset pages has been updated to improve compliance with Schema.org's schema and Google's recommendations. +New jvm-option: dataverse.personOrOrg.assumeCommaInPersonName, default is false + Backward compatibility - author/creators now have an @type of Person or Organization and any affiliation (affiliation for Person, parentOrganization for Organization) is now an object of @type Organization \ No newline at end of file diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index f2de9d5702f..3e01f372c9b 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -1627,8 +1627,6 @@ By default, download URLs to files will be included in Schema.org JSON-LD output ``./asadmin create-jvm-options '-Ddataverse.files.hide-schema-dot-org-download-urls=true'`` -Please note that there are other reasons why download URLs may not be included for certain files such as if a guestbook entry is required or if the file is restricted. - For more on Schema.org JSON-LD, see the :doc:`/admin/metadataexport` section of the Admin Guide. .. 
_useripaddresssourceheader: @@ -1658,6 +1656,14 @@ This setting is useful in cases such as running your Dataverse installation behi "HTTP_FORWARDED", "HTTP_VIA", "REMOTE_ADDR" + +dataverse.personOrOrg.assumeCommaInPersonName ++++++++++++++++++++++++++++++++++++++++++++++ + +Please note that this setting is experimental. + +The Schema.org metadata export and the Schema.org metadata included in DatasetPages tries to infer whether each entry in the Author field is a Person or Organization. If you are sure that +users are following the guidance to add people in the recommended family name, given name order, with a comma, you can set this true to always assume entries without a comma are for Organizations. The default is false. .. _:ApplicationServerSettings: From ebb138042f8b9134482a0a2119b9008f76ab80a1 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Oct 2022 09:57:04 -0400 Subject: [PATCH 14/22] added org Phrases for DANS vs creating a second PR --- .../harvard/iq/dataverse/DatasetVersion.java | 2 +- .../iq/dataverse/util/PersonOrOrgUtil.java | 82 +++++++++++++++---- .../iq/dataverse/util/json/JsonUtil.java | 6 ++ .../dataverse/util/PersonOrOrgUtilTest.java | 18 +++- 4 files changed, 90 insertions(+), 18 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index 1204d1dd4f1..c374204f73f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -1809,7 +1809,7 @@ public String getJsonLd() { if (authorAffiliation != null) { affiliation = datasetAuthor.getAffiliation().getDisplayValue(); } - JsonObject entity = PersonOrOrgUtil.getPersonOrOrganization(name, (identifierAsUrl==null)); + JsonObject entity = PersonOrOrgUtil.getPersonOrOrganization(name, false, (identifierAsUrl==null)); String givenName= entity.containsKey("givenName") ? 
entity.getString("givenName"):null; String familyName= entity.containsKey("familyName") ? entity.getString("familyName"):null; diff --git a/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java index b8089422fcd..497cc689983 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java @@ -1,11 +1,18 @@ package edu.harvard.iq.dataverse.util; +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Logger; + +import javax.json.JsonArray; import javax.json.JsonObject; import javax.json.JsonObjectBuilder; +import javax.json.JsonString; import edu.harvard.iq.dataverse.export.openaire.Cleanup; import edu.harvard.iq.dataverse.export.openaire.FirstNames; import edu.harvard.iq.dataverse.export.openaire.Organizations; +import edu.harvard.iq.dataverse.util.json.JsonUtil; import edu.harvard.iq.dataverse.util.json.NullSafeJsonBuilder; /** @@ -18,36 +25,63 @@ * whether a name is that of a Person or Organization and, if the * former, to pull out the given and family names. * - * Adds a parameter that can improve accuracy, e.g. for curated - * repositories, allowing the code to assume that all Person entries are - * in , order. + * Adds parameters that can improve accuracy: + * + * * e.g. for curated repositories, allowing the code to assume that all + * Person entries are in , order. * - * Possible ToDo - one could also allow local configuration of specific - * words that will automatically categorize one-off cases that the - * algorithm would otherwise mis-categorize. For example, the code - * appears to not recognize names ending in "Project" as an - * Organization. + * * allow local configuration of specific words/phrases that will + * automatically categorize one-off cases that the algorithm would + * otherwise mis-categorize. 
For example, the code appears to not + * recognize names ending in "Project" as an Organization. * */ public class PersonOrOrgUtil { + private static final Logger logger = Logger.getLogger(PersonOrOrgUtil.class.getCanonicalName()); + static boolean assumeCommaInPersonName = false; + static List orgPhrases; static { setAssumeCommaInPersonName(Boolean.parseBoolean(System.getProperty("dataverse.personOrOrg.assumeCommaInPersonName", "false"))); + setOrgPhraseArray(System.getProperty("dataverse.personOrOrg.orgPhraseArray", null)); } - public static JsonObject getPersonOrOrganization(String name, boolean organizationIfTied) { + /** + * This method tries to determine if a name belongs to a person or an + * organization and, if it is a person, what the given and family names are. The + * core algorithm is adapted from a Datacite algorithm, see + * https://github.com/IQSS/dataverse/issues/2243#issuecomment-358615313 + * + * @param name + * - the name to test + * @param organizationIfTied + * - if a given name isn't found, should the name be assumed to be + * from an organization. This could be a generic true/false or + * information from some non-name aspect of the entity, e.g. which + * field is in use, or whether a .edu email exists, etc. + * @param isPerson + * - if this is known to be a person due to other info (i.e. they + * have an ORCID). In this case the algorithm is just looking for + * given/family names. + * @return + */ + public static JsonObject getPersonOrOrganization(String name, boolean organizationIfTied, boolean isPerson) { name = Cleanup.normalize(name); String givenName = null; String familyName = null; - // adapted from a Datacite algorithm, - // https://github.com/IQSS/dataverse/issues/2243#issuecomment-358615313 - boolean isOrganization = Organizations.getInstance().isOrganization(name); - // ToDo - could add a check of stop words to handle problem cases, i.e. 
if name - // contains something in that list, it is an org + + boolean isOrganization = !isPerson && Organizations.getInstance().isOrganization(name); + if (!isOrganization) { + for (String phrase : orgPhrases) { + if (name.contains(phrase)) { + isOrganization = true; + } + } + } if (name.contains(",")) { givenName = FirstNames.getInstance().getFirstName(name); // contributorName=, @@ -63,7 +97,7 @@ public static JsonObject getPersonOrOrganization(String name, boolean organizati } } else if (isOrganization || organizationIfTied) { isOrganization = true; - givenName=null; + givenName = null; } } else { @@ -94,6 +128,24 @@ public static JsonObject getPersonOrOrganization(String name, boolean organizati } + // Public for testing + public static void setOrgPhraseArray(String phraseArray) { + orgPhrases = new ArrayList(); + if (!StringUtil.isEmpty(phraseArray)) { + try { + JsonArray phrases = JsonUtil.getJsonArray(phraseArray); + phrases.forEach(val -> { + JsonString strVal = (JsonString) val; + orgPhrases.add(strVal.getString()); + }); + } catch (Exception e) { + logger.warning("Could not parse Org phrase list"); + } + } + + } + + // Public for testing public static void setAssumeCommaInPersonName(boolean assume) { assumeCommaInPersonName = assume; } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonUtil.java index f4a3c635f8b..21ff0e03773 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonUtil.java @@ -63,4 +63,10 @@ public static javax.json.JsonObject getJsonObject(String serializedJson) { return Json.createReader(rdr).readObject(); } } + + public static javax.json.JsonArray getJsonArray(String serializedJson) { + try (StringReader rdr = new StringReader(serializedJson)) { + return Json.createReader(rdr).readArray(); + } + } } diff --git 
a/src/test/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtilTest.java b/src/test/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtilTest.java index dbda622b536..b22f18ca787 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtilTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtilTest.java @@ -27,6 +27,16 @@ public void testOrganizationCOMPLEXName() { verifyIsOrganization("The Ford Foundation"); verifyIsOrganization("United Nations Economic and Social Commission for Asia and the Pacific (UNESCAP)"); verifyIsOrganization("Michael J. Fox Foundation for Parkinson's Research"); + // The next example is one known to be asserted to be a Person without an entry + // in the OrgWordArray + // So we test with it in the array and then when the array is empty to verify + // the array works, resetting the array works, and the problem still exists in + // the underlying algorithm + PersonOrOrgUtil.setOrgPhraseArray("[\"Portable\"]"); + verifyIsOrganization("Portable Antiquities of the Netherlands"); + PersonOrOrgUtil.setOrgPhraseArray(null); + JsonObject obj = PersonOrOrgUtil.getPersonOrOrganization("Portable Antiquities of the Netherlands", false, false); + assertTrue(obj.getBoolean("isPerson")); } @Test @@ -79,7 +89,7 @@ public void testName() { } private void verifyIsOrganization(String fullName) { - JsonObject obj = PersonOrOrgUtil.getPersonOrOrganization(fullName, false); + JsonObject obj = PersonOrOrgUtil.getPersonOrOrganization(fullName, false, false); System.out.println(JsonUtil.prettyPrint(obj)); assertEquals(obj.getString("fullName"),fullName); assertFalse(obj.getBoolean("isPerson")); @@ -87,7 +97,11 @@ private void verifyIsOrganization(String fullName) { } private void verifyIsPerson(String fullName, String givenName, String familyName) { - JsonObject obj = PersonOrOrgUtil.getPersonOrOrganization(fullName, false); + verifyIsPerson(fullName, givenName, familyName, false); + } + + private void verifyIsPerson(String fullName, String 
givenName, String familyName, boolean isPerson) { + JsonObject obj = PersonOrOrgUtil.getPersonOrOrganization(fullName, false, isPerson); System.out.println(JsonUtil.prettyPrint(obj)); assertEquals(obj.getString("fullName"),fullName); assertTrue(obj.getBoolean("isPerson")); From 4dcd8ed8e68807fd0381170f8260b381383b3171 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Oct 2022 11:14:34 -0400 Subject: [PATCH 15/22] fix affiliation value (no parens) --- src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index c374204f73f..b7eca85e95b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -1807,7 +1807,7 @@ public String getJsonLd() { DatasetField authorAffiliation = datasetAuthor.getAffiliation(); String affiliation = null; if (authorAffiliation != null) { - affiliation = datasetAuthor.getAffiliation().getDisplayValue(); + affiliation = datasetAuthor.getAffiliation().getValue(); } JsonObject entity = PersonOrOrgUtil.getPersonOrOrganization(name, false, (identifierAsUrl==null)); String givenName= entity.containsKey("givenName") ? 
entity.getString("givenName"):null; From 0184b3d9afce7d83db4c6b0bb6e5956f0daa8b4b Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Oct 2022 12:19:23 -0400 Subject: [PATCH 16/22] logic fix --- src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index b7eca85e95b..061712f6864 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -1809,7 +1809,7 @@ public String getJsonLd() { if (authorAffiliation != null) { affiliation = datasetAuthor.getAffiliation().getValue(); } - JsonObject entity = PersonOrOrgUtil.getPersonOrOrganization(name, false, (identifierAsUrl==null)); + JsonObject entity = PersonOrOrgUtil.getPersonOrOrganization(name, false, (identifierAsUrl!=null)); String givenName= entity.containsKey("givenName") ? entity.getString("givenName"):null; String familyName= entity.containsKey("familyName") ? 
entity.getString("familyName"):null; From 545a295764e71f63dc0b3d6480805801f1ef51f6 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Oct 2022 12:40:05 -0400 Subject: [PATCH 17/22] comma check shouldn't override isPerson --- .../java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java index 497cc689983..bacbb705721 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java @@ -101,7 +101,7 @@ public static JsonObject getPersonOrOrganization(String name, boolean organizati } } else { - if (assumeCommaInPersonName) { + if (assumeCommaInPersonName && !isPerson) { isOrganization = true; } else { givenName = FirstNames.getInstance().getFirstName(name); From ab2326c38aef3f76d1ee824606fcad8c73bc2944 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Oct 2022 13:18:29 -0400 Subject: [PATCH 18/22] always set givenName null for Org --- src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java index bacbb705721..3a8088aac77 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java @@ -115,6 +115,7 @@ public static JsonObject getPersonOrOrganization(String name, boolean organizati // default if (isOrganization || organizationIfTied) { isOrganization = true; + givenName=null; } } } From 0d541064d17d4b8d64d61db617e0d541613ec711 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Oct 2022 14:24:46 -0400 Subject: [PATCH 19/22] optimize - break out of loop when done --- 
src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java index 3a8088aac77..da33fc9597e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/PersonOrOrgUtil.java @@ -79,6 +79,7 @@ public static JsonObject getPersonOrOrganization(String name, boolean organizati for (String phrase : orgPhrases) { if (name.contains(phrase)) { isOrganization = true; + break; } } } From 1d935fe580284384328f8374c9f223f71916c4c6 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Oct 2022 14:55:40 -0400 Subject: [PATCH 20/22] documentation of new options --- doc/sphinx-guides/source/admin/metadataexport.rst | 10 ++++++++++ doc/sphinx-guides/source/installation/config.rst | 11 ++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/sphinx-guides/source/admin/metadataexport.rst b/doc/sphinx-guides/source/admin/metadataexport.rst index 78b8c8ce223..200c3a3e342 100644 --- a/doc/sphinx-guides/source/admin/metadataexport.rst +++ b/doc/sphinx-guides/source/admin/metadataexport.rst @@ -57,3 +57,13 @@ Downloading Metadata via API ---------------------------- The :doc:`/api/native-api` section of the API Guide explains how end users can download the metadata formats above via API. + +Exporter Configuration +---------------------- + +Two exporters - Schema.org JSONLD and OpenAire - use an algorithm to determine whether an author, or contact, name belongs to a person or organization. While the algorithm works well, there are cases in which it makes mistakes, usually inferring that an organization is a person. + +The Dataverse software implements two jvm-options that can be used to tune the algorithm: + +- :ref:`dataverse.personOrOrg.assumeCommaInPersonName` - boolean, default false. 
If true, Dataverse will assume any name without a comma must be an organization. This may be most useful for curated Dataverse instances that enforce the "family name, given name" convention. +- :ref:`dataverse.personOrOrg.orgPhraseArray` - a JsonArray of strings. Any name that contains one of the strings is assumed to be an organization. For example, "Project" is a word that is not otherwise associated with being an organization. diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 3e01f372c9b..5d4d29271f9 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -1662,9 +1662,18 @@ dataverse.personOrOrg.assumeCommaInPersonName Please note that this setting is experimental. -The Schema.org metadata export and the Schema.org metadata included in DatasetPages tries to infer whether each entry in the Author field is a Person or Organization. If you are sure that +The Schema.org metadata and OpenAIRE exports and the Schema.org metadata included in DatasetPages try to infer whether each entry in the various fields (e.g. Author, Contributor) is a Person or Organization. If you are sure that users are following the guidance to add people in the recommended family name, given name order, with a comma, you can set this true to always assume entries without a comma are for Organizations. The default is false. +dataverse.personOrOrg.orgPhraseArray ++++++++++++++++++++++++++++++++++++++ + +Please note that this setting is experimental. + +The Schema.org metadata and OpenAIRE exports and the Schema.org metadata included in DatasetPages try to infer whether each entry in the various fields (e.g. Author, Contributor) is a Person or Organization. +If you have examples where an organization name is being inferred to belong to a person, you can use this setting to force it to be recognized as an organization.
+The value is expected to be a JsonArray of strings. Any name that contains one of the strings is assumed to be an organization. For example, "Project" is a word that is not otherwise associated with being an organization. + .. _:ApplicationServerSettings: Application Server Settings From a5ae4d782c63ba71a72f0da1748b7f62e1904434 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Oct 2022 14:59:59 -0400 Subject: [PATCH 21/22] add labels --- doc/sphinx-guides/source/installation/config.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 5d4d29271f9..96397b707ff 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -1657,6 +1657,8 @@ This setting is useful in cases such as running your Dataverse installation behi "HTTP_VIA", "REMOTE_ADDR" +.. _dataverse.personOrOrg.assumeCommaInPersonName: + dataverse.personOrOrg.assumeCommaInPersonName +++++++++++++++++++++++++++++++++++++++++++++ @@ -1665,6 +1667,8 @@ Please note that this setting is experimental. The Schema.org metadata and OpenAIRE exports and the Schema.org metadata included in DatasetPages try to infer whether each entry in the various fields (e.g. Author, Contributor) is a Person or Organization. If you are sure that users are following the guidance to add people in the recommended family name, given name order, with a comma, you can set this true to always assume entries without a comma are for Organizations. The default is false. +.. 
_dataverse.personOrOrg.orgPhraseArray: + dataverse.personOrOrg.orgPhraseArray ++++++++++++++++++++++++++++++++++++ From 863be659ca7aeaa72b5b50aa527bf812ea99dbc9 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 31 Jan 2023 15:31:35 -0500 Subject: [PATCH 22/22] merge fixes --- .../iq/dataverse/export/SchemaDotOrgExporterTest.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporterTest.java b/src/test/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporterTest.java index afce0028b5a..e660cf78da2 100644 --- a/src/test/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporterTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporterTest.java @@ -75,8 +75,6 @@ public static void tearDownClass() { public void testExportDataset() throws JsonParseException, ParseException, IOException { File datasetVersionJson = new File("src/test/resources/json/dataset-finch2.json"); String datasetVersionAsJson = new String(Files.readAllBytes(Paths.get(datasetVersionJson.getAbsolutePath()))); - License license = new License("CC0 1.0", "You can copy, modify, distribute and perform the work, even for commercial purposes, all without asking permission.", URI.create("http://creativecommons.org/publicdomain/zero/1.0/"), URI.create("/resources/images/cc0.png"), true, 1l); - license.setDefault(true); JsonObject json = JsonUtil.getJsonObject(datasetVersionAsJson); JsonObject json2 = createExportFromJson(json); @@ -99,9 +97,7 @@ public void testExportDataset() throws JsonParseException, ParseException, IOExc assertEquals("1955-11-05", json2.getString("datePublished")); assertEquals("1955-11-05", json2.getString("dateModified")); assertEquals("1", json2.getString("version")); - assertEquals("Darwin's finches (also known as the Galápagos finches) are a group of about fifteen species of passerine birds.", json2.getJsonArray("description").getString(0)); - assertEquals("Bird is the word.", 
json2.getJsonArray("description").getString(1)); - assertEquals(2, json2.getJsonArray("description").size()); + assertEquals("Darwin's finches (also known as the Galápagos finches) are a group of about fifteen species of passerine birds.\nBird is the word.", json2.getString("description")); assertEquals("Medicine, Health and Life Sciences", json2.getJsonArray("keywords").getString(0)); assertEquals("tcTerm1", json2.getJsonArray("keywords").getString(1)); assertEquals("KeywordTerm1", json2.getJsonArray("keywords").getString(2));