Skip to content

Commit

Permalink
Improve deduplication of publications based on DOI
Browse files Browse the repository at this point in the history
  • Loading branch information
sarkikos committed Dec 15, 2023
1 parent ffb502a commit 4fc7824
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 44 deletions.
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
using Xunit;
using System.Collections.Generic;
using Xunit;
using System.Collections.Generic;
using api.Services;
using api.Models;
using api.Models.Common;
using api.Models.ProfileEditor;
using api.Models.ProfileEditor.Items;
using api.Models.Ttv;

namespace api.Tests
{
[Collection("Duplicate handler service tests.")]
namespace api.Tests
{
[Collection("Duplicate handler service tests.")]
public class DuplicateHandlerServiceTests_HasSameDoiButIsDifferentPublication
{
[Fact(DisplayName = "Virta and ORCID publication have the same DOI, they are considered as the same publications: Virta publication type code is not A3, A4, B2, B3, D2, D3 or E1")]
Expand Down Expand Up @@ -302,11 +302,11 @@ public void addPublicationToProfileEditorData_020()
DimFieldDisplaySettings_FieldIdentifier = Constants.FieldIdentifiers.ACTIVITY_PUBLICATION
};

// Create ProfileDataRaw for ORCID publication 1. The same DOI and name as in Virta publication.
// Create ProfileDataRaw for ORCID publication 1. The same DOI (in uppercase letters) and name as in Virta publication.
ProfileDataFromSql profileDataOrcid1 = new()
{
DimProfileOnlyPublication_PublicationId = "publicationId456",
DimProfileOnlyPublication_Doi = "doi123",
DimProfileOnlyPublication_Doi = "DOI123",
DimProfileOnlyPublication_PublicationName = "name123",
DimFieldDisplaySettings_FieldIdentifier = Constants.FieldIdentifiers.ACTIVITY_PUBLICATION_PROFILE_ONLY
};
Expand Down Expand Up @@ -367,5 +367,5 @@ public void testPublicationYearHandling()
// Publication year null
Assert.Null(duplicateHandlerService.HandlePublicationYear(null));
}
}
}
}
48 changes: 43 additions & 5 deletions aspnetcore/src/api.Tests/Services_Tests/UserProfileServiceTest.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using Xunit;
using Xunit;
using api.Services;
using api.Models.Common;
using api.Models.Ttv;
Expand All @@ -7,9 +7,9 @@
using System;
using api.Models.ProfileEditor;

namespace api.Tests
{
[Collection("User profile service tests")]
namespace api.Tests
{
[Collection("User profile service tests")]
public class UserProfileServiceTests
{
[Fact(DisplayName = "Get FieldIdentifiers")]
Expand Down Expand Up @@ -500,5 +500,43 @@ public void ResearchActivityDeduplication()
"Research activities are not duplicates, Sv name differs"
);
}
}

[Fact(DisplayName = "Get profile editor source")]
public void GetProfileEditorSource()
{
    // Arrange: a service built with only the language service, and one SQL row
    // carrying the registered data source and its organization in all three languages.
    UserProfileService service = new(languageService: new LanguageService());
    ProfileDataFromSql sqlRow = new()
    {
        DimRegisteredDataSource_Id = 1234,
        DimRegisteredDataSource_Name = "TestRegisteredDataSourceName",
        DimRegisteredDataSource_DimOrganization_NameFi = "TestOrganizationNameFi",
        DimRegisteredDataSource_DimOrganization_NameEn = "TestOrganizationNameEn",
        DimRegisteredDataSource_DimOrganization_NameSv = "TestOrganizationNameSv",
        DimRegisteredDataSource_DimOrganization_DimSector_SectorId = "TestSectorId"
    };

    // Expected result: every value of the row is carried over unchanged.
    Organization expectedOrganization = new()
    {
        NameFi = "TestOrganizationNameFi",
        NameEn = "TestOrganizationNameEn",
        NameSv = "TestOrganizationNameSv",
        SectorId = "TestSectorId"
    };
    ProfileEditorSource expected = new()
    {
        Id = 1234,
        RegisteredDataSource = "TestRegisteredDataSourceName",
        Organization = expectedOrganization
    };

    // Act
    ProfileEditorSource actual = service.GetProfileEditorSource(sqlRow);

    // Assert: compare property by property, data source first, then organization.
    Assert.Equal(expected.Id, actual.Id);
    Assert.Equal(expected.RegisteredDataSource, actual.RegisteredDataSource);
    Assert.Equal(expected.Organization.NameFi, actual.Organization.NameFi);
    Assert.Equal(expected.Organization.NameEn, actual.Organization.NameEn);
    Assert.Equal(expected.Organization.NameSv, actual.Organization.NameSv);
    Assert.Equal(expected.Organization.SectorId, actual.Organization.SectorId);
}
}
}
4 changes: 2 additions & 2 deletions aspnetcore/src/api/Services/DuplicateHandlerService.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

using System.Collections.Generic;
using System.Collections.Generic;
using api.Models.Common;
using api.Models.ProfileEditor;
using api.Models.ProfileEditor.Items;
Expand Down Expand Up @@ -86,7 +86,7 @@ public List<ProfileEditorPublication> AddPublicationToProfileEditorData(ProfileE
if (
IsOrcidPublication(profileData) &&
profileData.DimProfileOnlyPublication_Doi != "" &&
profileData.DimProfileOnlyPublication_Doi == publication.Doi &&
profileData.DimProfileOnlyPublication_Doi.ToLower() == publication.Doi.ToLower() &&
!HasSameDoiButIsDifferentPublication(profileData.DimProfileOnlyPublication_PublicationName, publication)
)
{
Expand Down
1 change: 1 addition & 0 deletions aspnetcore/src/api/Services/IDuplicateHandlerService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@ public interface IDuplicateHandlerService
List<ProfileEditorPublication> AddPublicationToProfileEditorData(ProfileEditorSource dataSource, ProfileDataFromSql profileData, List<ProfileEditorPublication> publications);
bool HasSameDoiButIsDifferentPublication(string orcidPublicationName, ProfileEditorPublication publication);
bool IsOrcidPublication(ProfileDataFromSql profileData);
int? HandlePublicationYear(int? dimDateYear);
}
}
82 changes: 53 additions & 29 deletions aspnetcore/src/api/Services/UserProfileService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@ public UserProfileService(IDataSourceHelperService dataSourceHelperService)
_dataSourceHelperService = dataSourceHelperService;
}

/*
 * Constructor taking only the language service.
 * Assigns _languageService and nothing else.
 */
public UserProfileService(ILanguageService languageService) => _languageService = languageService;

/*
* Get FieldIdentifiers.
*/
Expand Down Expand Up @@ -1041,11 +1046,36 @@ await AddTtvDataToUserProfile(
await _ttvContext.SaveChangesAsync();
}

/*
 * Build the object describing where a profile data item comes from:
 * the registered data source and its organization.
 * Used for every profile data item.
 */
public ProfileEditorSource GetProfileEditorSource(ProfileDataFromSql p)
{
    // Let the language service resolve the organization name for fi/en/sv.
    NameTranslation organizationName = _languageService.GetNameTranslation(
        nameFi: p.DimRegisteredDataSource_DimOrganization_NameFi,
        nameEn: p.DimRegisteredDataSource_DimOrganization_NameEn,
        nameSv: p.DimRegisteredDataSource_DimOrganization_NameSv
    );

    // Organization part: translated names plus the sector id taken from the row.
    Organization sourceOrganization = new()
    {
        NameFi = organizationName.NameFi,
        NameEn = organizationName.NameEn,
        NameSv = organizationName.NameSv,
        SectorId = p.DimRegisteredDataSource_DimOrganization_DimSector_SectorId
    };

    return new ProfileEditorSource
    {
        Id = p.DimRegisteredDataSource_Id,
        RegisteredDataSource = p.DimRegisteredDataSource_Name,
        Organization = sourceOrganization
    };
}


/*
* Get profile data. New version using data structure,
* where each item contains a list of data sources.
* Get profile data.
*/
public async Task<ProfileEditorDataResponse> GetProfileDataAsync(int userprofileId, LogUserIdentification logUserIdentification, bool forElasticsearch = false)
{
Expand All @@ -1065,28 +1095,13 @@ public async Task<ProfileEditorDataResponse> GetProfileDataAsync(int userprofile
// Helper list, which is used in deduplication of research activities
List<ProfileDataFromSql> profileOnlyResearchActivityRowsToDeduplicate = new();

// Helper lists, which are used in DOI based deduplication of ORCID publications
List<ProfileDataFromSql> profileOnlyPublicationsToDeduplicate = new();

foreach (ProfileDataFromSql p in profileDataList)
{
// Organization name translation
NameTranslation nameTranslationSourceOrganization = _languageService.GetNameTranslation(
nameFi: p.DimRegisteredDataSource_DimOrganization_NameFi,
nameEn: p.DimRegisteredDataSource_DimOrganization_NameEn,
nameSv: p.DimRegisteredDataSource_DimOrganization_NameSv
);

// Source object containing registered data source and organization name.
ProfileEditorSource profileEditorSource = new()
{
Id = p.DimRegisteredDataSource_Id,
RegisteredDataSource = p.DimRegisteredDataSource_Name,
Organization = new Organization()
{
NameFi = nameTranslationSourceOrganization.NameFi,
NameEn = nameTranslationSourceOrganization.NameEn,
NameSv = nameTranslationSourceOrganization.NameSv,
SectorId = p.DimRegisteredDataSource_DimOrganization_DimSector_SectorId
}
};
ProfileEditorSource profileEditorSource = GetProfileEditorSource(p);

// Add data source into list of unique data sources.
if (!uniqueDataSourceIds.Contains(profileEditorSource.Id))
Expand Down Expand Up @@ -1467,7 +1482,7 @@ public async Task<ProfileEditorDataResponse> GetProfileDataAsync(int userprofile
);
break;

// Publication
// Publication (DimPublication)
case Constants.FieldIdentifiers.ACTIVITY_PUBLICATION:
profileDataResponse.activity.publications =
_duplicateHandlerService.AddPublicationToProfileEditorData(
Expand All @@ -1477,14 +1492,10 @@ public async Task<ProfileEditorDataResponse> GetProfileDataAsync(int userprofile
);
break;

// Publication (ORCID)
// Publication (DimProfileOnlyPublication)
// Collect items into a helper list. They will be deduplicated later.
case Constants.FieldIdentifiers.ACTIVITY_PUBLICATION_PROFILE_ONLY:
profileDataResponse.activity.publications =
_duplicateHandlerService.AddPublicationToProfileEditorData(
dataSource: profileEditorSource,
profileData: p,
publications: profileDataResponse.activity.publications
);
profileOnlyPublicationsToDeduplicate.Add(p);
break;

// Research activity
Expand Down Expand Up @@ -1931,6 +1942,19 @@ public async Task<ProfileEditorDataResponse> GetProfileDataAsync(int userprofile
}
}

/*
* ORCID publication deduplication
*/
foreach (ProfileDataFromSql p in profileOnlyPublicationsToDeduplicate)
{
ProfileEditorSource profileEditorSource = GetProfileEditorSource(p);
profileDataResponse.activity.publications = _duplicateHandlerService.AddPublicationToProfileEditorData(
dataSource: profileEditorSource,
profileData: p,
publications: profileDataResponse.activity.publications
);
}

/*
* Research activity deduplication
*
Expand Down

0 comments on commit 4fc7824

Please sign in to comment.