Skip to content

Commit

Permalink
update default similarity to PARTIAL
Browse files Browse the repository at this point in the history
- duplicate if either DOI or AuthorTitle match
    I cannot think of any situation where two legitimately distinct papers
will have the exact same Authors and titles, or of course same DOI
- tests updated to reflect that, and rewritten for clarity
  • Loading branch information
perrette committed Apr 21, 2023
1 parent 4742508 commit ab60d1e
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 54 deletions.
17 changes: 12 additions & 5 deletions papers/bib.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,12 @@ def entry_id(e):

FUZZY_RATIO = 80

# should be conservative (used in papers add)
DEFAULT_SIMILARITY = 'FAIR'
# DEFAULT_SIMILARITY = 'PARTIAL'
# Default similarity is used in papers add
# A false positive (too weak a test) means distinct entries will be merged
# A false negative means duplicates will be created
# PARTIAL means that if either DOI or author+title agree, that is good enough to be considered a duplicate
# I can't think of any situation where two legitimately distinct papers test True with partial similarity.
DEFAULT_SIMILARITY = 'PARTIAL'

EXACT_DUPLICATES = 104
GOOD_DUPLICATES = 103
Expand All @@ -122,10 +125,14 @@ def compare_entries(e1, e2, fuzzy=False):
if id1 == id2:
score = GOOD_DUPLICATES

elif all([f1==f2 for f1, f2 in zip(id1, id2) if f1 and f2]): # all defined fields agree
elif e1.get('doi') and e2.get('doi') and e1['doi'] == e2['doi']:
score = FAIR_DUPLICATES

elif any([f1==f2 for f1, f2 in zip(id1, id2) if f1 and f2]): # some of the defined fields agree
# elif all([f1==f2 for f1, f2 in zip(id1, id2) if f1 and f2]): # ID and AUTHORTITLE agree
# score = FAIR_DUPLICATES
# COMMENT: same as GOOD_DUPLICATES when all fields are defined, but also returns true when one field is missing in one entry

elif any([f1==f2 for f1, f2 in zip(id1, id2) if f1 and f2]): # any of ID or AUTHORTITLE agree
score = PARTIAL_DUPLICATES

elif not fuzzy:
Expand Down
94 changes: 45 additions & 49 deletions tests/test_papers.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,8 +346,9 @@ def assertMultiLineEqual(self, first, second, msg=None):



class SimilarityBase(unittest.TestCase):

class TestDuplicates(unittest.TestCase):
similarity = None

reference = """@article{Perrette_2011,
author = {M. Perrette and A. Yool and G. D. Quartly and E. E. Popova},
Expand Down Expand Up @@ -397,29 +398,34 @@ class TestDuplicates(unittest.TestCase):
year = {2012}
}"""

@staticmethod
def isduplicate(a, b):

def isduplicate(self, a, b):
"""test Biblio's eq method for duplicates
"""
db = bibtexparser.loads(a+'\n'+b)
e1, e2 = db.entries
refs = Biblio()
refs = Biblio(similarity=self.similarity)
return refs.eq(e1, e2)


class TestDuplicatesExact(SimilarityBase):

similarity = 'EXACT'

def test_exactsame(self):
self.assertTrue(self.isduplicate(self.reference, self.reference))

def test_anotherkey(self):
self.assertTrue(self.isduplicate(self.reference, self.anotherkey))
self.assertFalse(self.isduplicate(self.reference, self.anotherkey))

def test_missingfield(self):
self.assertTrue(self.isduplicate(self.reference, self.missingfield))
self.assertFalse(self.isduplicate(self.reference, self.missingfield))

def test_missingdoi(self):
self.assertTrue(self.isduplicate(self.reference, self.missingdoi))
self.assertFalse(self.isduplicate(self.reference, self.missingdoi))

def test_missingtitauthor(self):
self.assertTrue(self.isduplicate(self.reference, self.missingtitauthor))
self.assertFalse(self.isduplicate(self.reference, self.missingtitauthor))

def test_conflictauthor(self):
self.assertFalse(self.isduplicate(self.reference, self.conflictauthor))
Expand All @@ -428,68 +434,58 @@ def test_conflictdoi(self):
self.assertFalse(self.isduplicate(self.reference, self.conflictdoi))

def test_conflictyear(self):
self.assertTrue(self.isduplicate(self.reference, self.conflictyear))



class SimilarityBase:

similarity = None

def isduplicate(self, a, b):
"""test Biblio's eq method for duplicates
"""
db = bibtexparser.loads(a+'\n'+b)
e1, e2 = db.entries
refs = Biblio(similarity=self.similarity)
return refs.eq(e1, e2)
self.assertFalse(self.isduplicate(self.reference, self.conflictyear))


class TestDuplicatesExact(SimilarityBase, TestDuplicates):
class TestDuplicatesGood(TestDuplicatesExact):

similarity = 'EXACT'
similarity = 'GOOD'

def test_anotherkey(self):
self.assertFalse(self.isduplicate(self.reference, self.anotherkey))
self.assertTrue(self.isduplicate(self.reference, self.anotherkey))

def test_missingfield(self):
self.assertFalse(self.isduplicate(self.reference, self.missingfield))

def test_missingdoi(self):
self.assertFalse(self.isduplicate(self.reference, self.missingdoi))

def test_missingtitauthor(self):
self.assertFalse(self.isduplicate(self.reference, self.missingtitauthor))
self.assertTrue(self.isduplicate(self.reference, self.missingfield))

def test_conflictyear(self):
self.assertFalse(self.isduplicate(self.reference, self.conflictyear))


self.assertTrue(self.isduplicate(self.reference, self.conflictyear))

class TestDuplicatesGood(SimilarityBase, TestDuplicates):

similarity = 'GOOD'
class TestDuplicatesFair(TestDuplicatesGood):

def test_missingdoi(self):
self.assertFalse(self.isduplicate(self.reference, self.missingdoi))
similarity = 'FAIR'

def test_missingtitauthor(self):
self.assertFalse(self.isduplicate(self.reference, self.missingtitauthor))
self.assertTrue(self.isduplicate(self.reference, self.missingtitauthor))

def test_conflictauthor(self):
self.assertTrue(self.isduplicate(self.reference, self.conflictauthor))


class TestDuplicatesPartial(SimilarityBase, TestDuplicates):
class TestDuplicatesPartial(TestDuplicatesFair):

similarity = 'PARTIAL'

def test_conflictauthor(self):
self.assertTrue(self.isduplicate(self.reference, self.conflictauthor))
def test_missingdoi(self):
self.assertTrue(self.isduplicate(self.reference, self.missingdoi))

def test_conflictdoi(self):
self.assertTrue(self.isduplicate(self.reference, self.conflictdoi))


class TestDuplicates(TestDuplicatesPartial):

class TestDuplicatesAdd(SimilarityBase, TestDuplicates):
@staticmethod
def isduplicate(a, b):
"""test Biblio's eq method for duplicates
"""
db = bibtexparser.loads(a+'\n'+b)
e1, e2 = db.entries
refs = Biblio()
return refs.eq(e1, e2)


class TestDuplicatesAdd(TestDuplicates):

def setUp(self):
self.mybib = tempfile.mktemp(prefix='papers.bib')
Expand Down Expand Up @@ -760,15 +756,15 @@ class TestAddConflict(BibTest):
}"""

bibtex_conflict_key = """@article{Perrette_2011,
author = {M. Perrette and A. Yool and G. D. Quartly and E. E. Popova},
author = {M. Perrette and Another author},
doi = {10.5194/bg-8-515-2011XXX},
title = {Near-ubiquity of ice-edge blooms in the Arctic}
title = {Something else entirely}
}"""

bibtex_conflict_key_fixed = """@article{Perrette_2011b,
author = {M. Perrette and A. Yool and G. D. Quartly and E. E. Popova},
author = {M. Perrette and Another author},
doi = {10.5194/bg-8-515-2011XXX},
title = {Near-ubiquity of ice-edge blooms in the Arctic}
title = {Something else entirely}
}"""

bibtex_same_doi = """@article{same_doi,
Expand Down

0 comments on commit ab60d1e

Please sign in to comment.