Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve deduplication of publications based on DOI #225

Merged
merged 1 commit into from
Dec 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
using Xunit;
using System.Collections.Generic;
using Xunit;
using System.Collections.Generic;
using api.Services;
using api.Models;
using api.Models.Common;
using api.Models.ProfileEditor;
using api.Models.ProfileEditor.Items;
using api.Models.Ttv;

namespace api.Tests
{
[Collection("Duplicate handler service tests.")]
namespace api.Tests
{
[Collection("Duplicate handler service tests.")]
public class DuplicateHandlerServiceTests_HasSameDoiButIsDifferentPublication
{
[Fact(DisplayName = "Virta and ORCID publication have the same DOI, they are considered as the same publications: Virta publication type code is not A3, A4, B2, B3, D2, D3 or E1")]
Expand Down Expand Up @@ -302,11 +302,11 @@ public void addPublicationToProfileEditorData_020()
DimFieldDisplaySettings_FieldIdentifier = Constants.FieldIdentifiers.ACTIVITY_PUBLICATION
};

// Create ProfileDataRaw for ORCID publication 1. The same DOI and name as in Virta publication.
// Create ProfileDataRaw for ORCID publication 1. The same DOI (in uppercase letters) and name as in Virta publication.
ProfileDataFromSql profileDataOrcid1 = new()
{
DimProfileOnlyPublication_PublicationId = "publicationId456",
DimProfileOnlyPublication_Doi = "doi123",
DimProfileOnlyPublication_Doi = "DOI123",
DimProfileOnlyPublication_PublicationName = "name123",
DimFieldDisplaySettings_FieldIdentifier = Constants.FieldIdentifiers.ACTIVITY_PUBLICATION_PROFILE_ONLY
};
Expand Down Expand Up @@ -367,5 +367,5 @@ public void testPublicationYearHandling()
// Publication year null
Assert.Null(duplicateHandlerService.HandlePublicationYear(null));
}
}
}
}
48 changes: 43 additions & 5 deletions aspnetcore/src/api.Tests/Services_Tests/UserProfileServiceTest.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using Xunit;
using Xunit;
using api.Services;
using api.Models.Common;
using api.Models.Ttv;
Expand All @@ -7,9 +7,9 @@
using System;
using api.Models.ProfileEditor;

namespace api.Tests
{
[Collection("User profile service tests")]
namespace api.Tests
{
[Collection("User profile service tests")]
public class UserProfileServiceTests
{
[Fact(DisplayName = "Get FieldIdentifiers")]
Expand Down Expand Up @@ -500,5 +500,43 @@ public void ResearchActivityDeduplication()
"Research activities are not duplicates, Sv name differs"
);
}
}

[Fact(DisplayName = "Get profile editor source")]
public void GetProfileEditorSource()
{
    // Arrange: service constructed with only a language service, and one SQL row
    // populated with registered data source and organization fields.
    LanguageService languageService = new();
    UserProfileService userProfileService = new(languageService: languageService);
    ProfileDataFromSql sqlRow = new()
    {
        DimRegisteredDataSource_Id = 1234,
        DimRegisteredDataSource_Name = "TestRegisteredDataSourceName",
        DimRegisteredDataSource_DimOrganization_NameFi = "TestOrganizationNameFi",
        DimRegisteredDataSource_DimOrganization_NameEn = "TestOrganizationNameEn",
        DimRegisteredDataSource_DimOrganization_NameSv = "TestOrganizationNameSv",
        DimRegisteredDataSource_DimOrganization_DimSector_SectorId = "TestSectorId"
    };
    ProfileEditorSource expected = new()
    {
        Id = 1234,
        RegisteredDataSource = "TestRegisteredDataSourceName",
        Organization = new Organization()
        {
            NameFi = "TestOrganizationNameFi",
            NameEn = "TestOrganizationNameEn",
            NameSv = "TestOrganizationNameSv",
            SectorId = "TestSectorId"
        }
    };

    // Act
    ProfileEditorSource actual = userProfileService.GetProfileEditorSource(sqlRow);

    // Assert: compare field by field, data source first, then the nested organization.
    Assert.Equal(expected.Id, actual.Id);
    Assert.Equal(expected.RegisteredDataSource, actual.RegisteredDataSource);
    Assert.Equal(expected.Organization.NameFi, actual.Organization.NameFi);
    Assert.Equal(expected.Organization.NameEn, actual.Organization.NameEn);
    Assert.Equal(expected.Organization.NameSv, actual.Organization.NameSv);
    Assert.Equal(expected.Organization.SectorId, actual.Organization.SectorId);
}
}
}
4 changes: 2 additions & 2 deletions aspnetcore/src/api/Services/DuplicateHandlerService.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

using System.Collections.Generic;
using System.Collections.Generic;
using api.Models.Common;
using api.Models.ProfileEditor;
using api.Models.ProfileEditor.Items;
Expand Down Expand Up @@ -86,7 +86,7 @@ public List<ProfileEditorPublication> AddPublicationToProfileEditorData(ProfileE
if (
IsOrcidPublication(profileData) &&
profileData.DimProfileOnlyPublication_Doi != "" &&
profileData.DimProfileOnlyPublication_Doi == publication.Doi &&
profileData.DimProfileOnlyPublication_Doi.ToLower() == publication.Doi.ToLower() &&
!HasSameDoiButIsDifferentPublication(profileData.DimProfileOnlyPublication_PublicationName, publication)
)
{
Expand Down
1 change: 1 addition & 0 deletions aspnetcore/src/api/Services/IDuplicateHandlerService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@ public interface IDuplicateHandlerService
List<ProfileEditorPublication> AddPublicationToProfileEditorData(ProfileEditorSource dataSource, ProfileDataFromSql profileData, List<ProfileEditorPublication> publications);
bool HasSameDoiButIsDifferentPublication(string orcidPublicationName, ProfileEditorPublication publication);
bool IsOrcidPublication(ProfileDataFromSql profileData);
int? HandlePublicationYear(int? dimDateYear);
}
}
82 changes: 53 additions & 29 deletions aspnetcore/src/api/Services/UserProfileService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@ public UserProfileService(IDataSourceHelperService dataSourceHelperService)
_dataSourceHelperService = dataSourceHelperService;
}

/*
 * Constructor which sets only the language service dependency.
 * No other fields are assigned here.
 */
public UserProfileService(ILanguageService languageService) => _languageService = languageService;

/*
* Get FieldIdentifiers.
*/
Expand Down Expand Up @@ -1041,11 +1046,36 @@ await AddTtvDataToUserProfile(
await _ttvContext.SaveChangesAsync();
}

/*
 * Create object indicating the data source of a profile data row.
 * Used for every profile data item.
 *
 * The organization names are passed through the language service's
 * GetNameTranslation before being placed into the result; the id, data source
 * name and sector id are copied from the row as they are.
 */
public ProfileEditorSource GetProfileEditorSource(ProfileDataFromSql p)
{
    // Organization name translation
    NameTranslation organizationNames = _languageService.GetNameTranslation(
        nameFi: p.DimRegisteredDataSource_DimOrganization_NameFi,
        nameEn: p.DimRegisteredDataSource_DimOrganization_NameEn,
        nameSv: p.DimRegisteredDataSource_DimOrganization_NameSv
    );

    // Source object containing registered data source and organization name.
    return new ProfileEditorSource
    {
        Id = p.DimRegisteredDataSource_Id,
        RegisteredDataSource = p.DimRegisteredDataSource_Name,
        Organization = new Organization()
        {
            NameFi = organizationNames.NameFi,
            NameEn = organizationNames.NameEn,
            NameSv = organizationNames.NameSv,
            SectorId = p.DimRegisteredDataSource_DimOrganization_DimSector_SectorId
        }
    };
}


/*
* Get profile data. New version using data structure,
* where each item contains a list of data sources.
* Get profile data.
*/
public async Task<ProfileEditorDataResponse> GetProfileDataAsync(int userprofileId, LogUserIdentification logUserIdentification, bool forElasticsearch = false)
{
Expand All @@ -1065,28 +1095,13 @@ public async Task<ProfileEditorDataResponse> GetProfileDataAsync(int userprofile
// Helper list, which is used in deduplication of research activities
List<ProfileDataFromSql> profileOnlyResearchActivityRowsToDeduplicate = new();

// Helper lists, which are used in DOI based deduplication of ORCID publications
List<ProfileDataFromSql> profileOnlyPublicationsToDeduplicate = new();

foreach (ProfileDataFromSql p in profileDataList)
{
// Organization name translation
NameTranslation nameTranslationSourceOrganization = _languageService.GetNameTranslation(
nameFi: p.DimRegisteredDataSource_DimOrganization_NameFi,
nameEn: p.DimRegisteredDataSource_DimOrganization_NameEn,
nameSv: p.DimRegisteredDataSource_DimOrganization_NameSv
);

// Source object containing registered data source and organization name.
ProfileEditorSource profileEditorSource = new()
{
Id = p.DimRegisteredDataSource_Id,
RegisteredDataSource = p.DimRegisteredDataSource_Name,
Organization = new Organization()
{
NameFi = nameTranslationSourceOrganization.NameFi,
NameEn = nameTranslationSourceOrganization.NameEn,
NameSv = nameTranslationSourceOrganization.NameSv,
SectorId = p.DimRegisteredDataSource_DimOrganization_DimSector_SectorId
}
};
ProfileEditorSource profileEditorSource = GetProfileEditorSource(p);

// Add data source into list of unique data sources.
if (!uniqueDataSourceIds.Contains(profileEditorSource.Id))
Expand Down Expand Up @@ -1467,7 +1482,7 @@ public async Task<ProfileEditorDataResponse> GetProfileDataAsync(int userprofile
);
break;

// Publication
// Publication (DimPublication)
case Constants.FieldIdentifiers.ACTIVITY_PUBLICATION:
profileDataResponse.activity.publications =
_duplicateHandlerService.AddPublicationToProfileEditorData(
Expand All @@ -1477,14 +1492,10 @@ public async Task<ProfileEditorDataResponse> GetProfileDataAsync(int userprofile
);
break;

// Publication (ORCID)
// Publication (DimProfileOnlyPublication)
// Collect items into a helper list. They will be deduplicated later.
case Constants.FieldIdentifiers.ACTIVITY_PUBLICATION_PROFILE_ONLY:
profileDataResponse.activity.publications =
_duplicateHandlerService.AddPublicationToProfileEditorData(
dataSource: profileEditorSource,
profileData: p,
publications: profileDataResponse.activity.publications
);
profileOnlyPublicationsToDeduplicate.Add(p);
break;

// Research activity
Expand Down Expand Up @@ -1931,6 +1942,19 @@ public async Task<ProfileEditorDataResponse> GetProfileDataAsync(int userprofile
}
}

/*
* ORCID publication deduplication
*/
foreach (ProfileDataFromSql p in profileOnlyPublicationsToDeduplicate)
{
ProfileEditorSource profileEditorSource = GetProfileEditorSource(p);
profileDataResponse.activity.publications = _duplicateHandlerService.AddPublicationToProfileEditorData(
dataSource: profileEditorSource,
profileData: p,
publications: profileDataResponse.activity.publications
);
}

/*
* Research activity deduplication
*
Expand Down