import logging
import os
import re
from lxml import etree
from fuzzywuzzy import fuzz
from collections import Counter, defaultdict
from core.metadata_layer import (
ContributorData,
Metadata,
)
from core.model import (
Contributor,
DataSource,
Representation,
)
from core.util.personal_names import (
contributor_name_match_ratio,
display_name_to_sort_name,
is_corporate_name,
normalize_contributor_name_for_matching,
)
from core.util.titles import (
normalize_title_for_matching,
title_match_ratio,
unfluff_title,
)
from core.util.xmlparser import (
XMLParser,
)
class NameParser(object):
"""Parse VIAF-style personal names.
These are used by VIAF but also (in slightly modified form) by
OCLC Classify.
    TODO: VIAFParser has its own "name parsing" code which focuses on
    extracting data from the XML generated by the VIAF API. These code
    bases need to be merged.
"""
    LIFESPAN = re.compile(r"([0-9]*)\??-([0-9]*)[?.;]*$")
@classmethod
def parse(cls, string):
"""Parse a string into a ContributorData object.
This may include sort_name, birth_date, and death_date.
"""
string = string.strip()
sort_name, birth, death = cls._get_lifespan(string)
        extra = dict()
        if birth is not None:
            extra[Contributor.BIRTH_DATE] = birth
        if death is not None:
            extra[Contributor.DEATH_DATE] = death
return ContributorData(
sort_name=sort_name,
extra=extra,
)
@classmethod
def _get_lifespan(cls, author):
"""Extract a possible lifespan from an author string.
:return: A 3-tuple (name_without_lifespan, birth, death)
"""
birth = None
death = None
match = cls.LIFESPAN.search(author)
if match:
# "Baxter, Charles, 1947-" -> "Baxter, Charles, "
name_without_lifespan = author[:match.start()].strip()
birth, death = match.groups()
if birth == '':
birth = None
if death == '':
death = None
else:
name_without_lifespan = author
# "Baxter, Charles, " -> "Baxter, Charles"
name_without_lifespan = name_without_lifespan.strip()
if name_without_lifespan.endswith(","):
name_without_lifespan = name_without_lifespan[:-1]
return name_without_lifespan.strip(), birth, death
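# An illustrative sketch of the expected behavior (not from the original
# source; derived from the lifespan-stripping logic above):
#
#   >>> data = NameParser.parse("Baxter, Charles, 1947-")
#   >>> data.sort_name
#   'Baxter, Charles'
#   >>> data.extra[Contributor.BIRTH_DATE]
#   '1947'
#
# A closed lifespan such as "Austen, Jane, 1775-1817" would populate
# Contributor.DEATH_DATE as well.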
class VIAFParser(XMLParser):
NAMESPACES = {'ns2' : "http://viaf.org/viaf/terms#"}
log = logging.getLogger("VIAF Parser")
wikidata_id = re.compile("^Q[0-9]")
@classmethod
    def combine_nameparts(cls, given, family, extra):
"""Turn a (given name, family name, extra) 3-tuple into a
display name.
"""
if not given and not family:
return None
if family and not given:
display_name = family
elif given and not family:
display_name = given
else:
display_name = given + ' ' + family
if extra and not extra.startswith('pseud'):
if family and given:
display_name += ', ' + extra
else:
display_name += ' ' + extra
return display_name
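    # Illustrative examples, derived from the branches above (not from the
    # original source):
    #
    #   >>> VIAFParser.combine_nameparts("Charles", "Dickens", None)
    #   'Charles Dickens'
    #   >>> VIAFParser.combine_nameparts("Charles", "Dickens", "Jr.")
    #   'Charles Dickens, Jr.'
    #   >>> VIAFParser.combine_nameparts(None, "Voltaire", "pseudonym")
    #   'Voltaire'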
@classmethod
def name_matches(cls, n1, n2):
""" Returns true if n1 and n2 are identical strings, bar periods.
"""
return n1.replace(".", "").lower() == n2.replace(".", "").lower()
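    # For example (illustrative): name_matches("J. K. Rowling", "j k rowling")
    # is True, since both sides drop periods and lowercase before comparing.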
@classmethod
def prepare_contributor_name_for_matching(cls, name):
"""
Normalize the special characters and inappropriate spacings away.
Put the name into title, first, middle, last, suffix, nickname order,
and lowercase.
"""
return normalize_contributor_name_for_matching(name)
@classmethod
def weigh_contributor(cls, candidate, working_sort_name, known_titles=None, strict=False, ignore_popularity=False):
""" Find the author who corresponds the best to the working_sort_name.
Consider as evidence of suitability:
- top-most in viaf-returned xml (most popular in libraries)
- various name/pseudonym fields within xml match
- has written titles that match ones passed in.
Actual weight numbers do not matter, only their weights relative to each other.
So, if the total match confidence is 110%, that's acceptable, and may not even
be the best match if there's a 120% out there. But having an exact title match
does matter more than a fuzzy unimarc tag match.
"""
report_string = "no_viaf"
(contributor, match_confidences, contributor_titles) = candidate
if contributor.viaf:
report_string = "viaf=%s" % contributor.viaf
if not match_confidences:
# we didn't get it from the xml, but we'll add to it now
match_confidences = {}
# If we're not sure that this is even the right cluster for
# the given author, make sure that one of the working names
# shows up in a name record.
if strict:
if not match_confidences:
return 0
# Assign weights to fields matched in the xml.
# The fuzzy matching returned a number between 0 and 100,
# now tell the system that we find sort_name to be a more reliable indicator
# than unimarc flags.
# Weights are cumulative -- if both the sort and display name match, that helps us
# be extra special sure. But what to do if unimarc tags match and sort_name doesn't?
# Here's where the strict tag comes in. With strict, a failed sort_name match says "no"
# to any other suggestions of a possible fit.
match_confidences["total"] = 0
if "library_popularity" in match_confidences and not ignore_popularity:
match_confidences["total"] += -10 * match_confidences["library_popularity"]
report_string += ", pop=10 * %s" % match_confidences["library_popularity"]
if "sort_name" in match_confidences:
# fuzzy match filter may not always give a 100% match, so cap arbitrarily at 90% as a "sure match"
if strict and match_confidences["sort_name"] < 90:
match_confidences["total"] = 0
report_string += ", strict and no sort_name match, return 0 (%s)" % match_confidences["sort_name"]
return 0
match_confidences["total"] += 2 * match_confidences["sort_name"]
report_string += ", mc[sort_name]= %s" % match_confidences["sort_name"]
if "display_name" in match_confidences:
match_confidences["total"] += 0.5 * match_confidences["display_name"]
report_string += ", mc[display_name]=%s" % match_confidences["display_name"]
if "unimarc" in match_confidences:
match_confidences["total"] += 0.3 * match_confidences["unimarc"]
report_string += ", mc[unimarc]=%s" % match_confidences["unimarc"]
if "guessed_sort_name" in match_confidences:
match_confidences["total"] += 0.5 * match_confidences["guessed_sort_name"]
report_string += ", mc[guessed_sort_name]=%s" % match_confidences["guessed_sort_name"]
if "alternate_name" in match_confidences:
match_confidences["total"] += 0.2 * match_confidences["alternate_name"]
report_string += ", mc[alternate_name]=%s" % match_confidences["alternate_name"]
# Add in some data quality evidence. We want the contributor to have recognizable
# data to work with.
if contributor.display_name:
match_confidences["total"] += 0.2
report_string += ", have contributor.display_name=%s" % contributor.display_name
if contributor.viaf:
match_confidences["total"] += 0.2
cls.weigh_titles(known_titles, contributor_titles, match_confidences, strict)
if "title" in match_confidences:
report_string += ", mc[title]=%s" % match_confidences["title"]
report_string += ", mc[total]= %s" % match_confidences["total"]
cls.log.debug("weigh_contributor found: " + report_string)
        # TODO: in the calling code, create a cloud of interrelated contributors
        # around the primary one picked, with relevancy weights given by this method.
return match_confidences["total"]
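    # For reference, the relative weights applied above: library_popularity
    # is a penalty (-10 per position down the list), sort_name counts double
    # (x2), display_name and guessed_sort_name count half (x0.5), unimarc
    # x0.3, alternate_name x0.2, and title matches (via weigh_titles) x0.8
    # for exact/unfluffed matches or x0.6 for fuzzy ones.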
@classmethod
def weigh_titles(cls, known_titles=None, contributor_titles=None, match_confidences=None, strict=False):
        if known_titles:
            contributor_titles = contributor_titles or []
            for known_title in known_titles:
if strict:
if known_title in contributor_titles:
match_confidences["title"] = 100
match_confidences["total"] += 0.8 * match_confidences["title"]
# once we find one matching title, no need to keep looking
break
else:
for contributor_title in contributor_titles:
# when the second half of the title has something like:
# "Edited by", a colon or semicolon, a bracket or parentheses, a hyphen,
# one of the institutional authors, like Disney Book Group, elibrary, Inc,
# Harvard University, Harper & Brothers,
                        # then see if we can get an exact substring match on the title.
# We want to accept "Pride and Prejudice (Unabridged)" as equivalent to
# "Pride and Prejudice", but reject "Pride and Prejudice and Zombies"
# as probably not written by Jane Austen.
# TODO: In future, consider doing:
# "Pride and Prejudice (Spanish)" should connect to two authors --
# Jane Austen and the translator.
if cls.name_matches(unfluff_title(contributor_title), unfluff_title(known_title)):
match_confidences["title"] = 90
match_confidences["total"] += 0.8 * match_confidences["title"]
# match is good enough, we can stop
break
                        # Fixes the case where
                        # <ns1:title>Britain, detente and changing east-west relations</ns1:title>
                        # (with an accented e in detente) doesn't match
                        # "Britain, Detente and Changing East-West Relations" in our DB.
match_confidence = title_match_ratio(known_title, contributor_title)
match_confidences["title"] = match_confidence
if match_confidence > 80:
match_confidences["total"] += 0.6 * match_confidence
# match is good enough, we can stop
break
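    # An illustrative walk-through of the non-strict branch: with
    # known_titles=["Pride and Prejudice"] and a contributor title of
    # "Pride and Prejudice (Unabridged)", unfluff_title() should reduce both
    # to the same core title, so the candidate gets
    # match_confidences["title"] = 90; "Pride and Prejudice and Zombies"
    # would instead fall through to the fuzzy title_match_ratio() check.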
def alternate_name_forms_for_cluster(self, cluster):
"""Find all pseudonyms in the given cluster."""
for tag in ('400', '700'):
for data_field in self._xpath(
cluster, './/*[local-name()="datafield"][@dtype="MARC21"][@tag="%s"]' % tag):
for potential_match in self._xpath(
data_field, '*[local-name()="subfield"][@code="a"]'):
yield potential_match.text
def sort_names_for_cluster(self, cluster):
"""Find all sort names for the given cluster."""
for tag in ('100', '110'):
for data_field in self._xpath(
cluster, './/*[local-name()="datafield"][@dtype="MARC21"][@tag="%s"]' % tag):
for potential_match in self._xpath(
data_field, '*[local-name()="subfield"][@code="a"]'):
yield potential_match.text
def name_titles_for_cluster(self, cluster):
"""Find all sort names for the given cluster."""
for tag in ('100', '110'):
for data_field in self._xpath(
cluster, './/*[local-name()="datafield"][@dtype="MARC21"][@tag="%s"]' % tag):
for potential_match in self._xpath(
data_field, '*[local-name()="subfield"][@code="c"]'):
yield potential_match.text
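    # The three generators above all expect MARC21 datafields shaped roughly
    # like this (an invented fragment, for illustration only):
    #
    #   <ns2:datafield dtype="MARC21" tag="100">
    #     <ns2:subfield code="a">Austen, Jane,</ns2:subfield>
    #     <ns2:subfield code="c">Sir</ns2:subfield>
    #   </ns2:datafield>
    #
    # Tags 100/110 subfield "a" yield sort names, tags 400/700 yield
    # alternate name forms (pseudonyms), and subfield "c" yields titles
    # associated with the name.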
def cluster_has_record_for_named_author(
self, cluster, working_sort_name, working_display_name, contributor_data=None):
""" Looks through the xml cluster for all fields that could indicate the
author's name.
Don't short-circuit the xml parsing process -- if found an author name
match, keep parsing and see what else can find.
:return: a dictionary containing description of xml field
that matched author name searched for.
"""
match_confidences = {}
if not contributor_data:
contributor_data = ContributorData()
# If we have a sort name to look for, and it's in this cluster's
# sort names, great.
if working_sort_name:
for potential_match in self.sort_names_for_cluster(cluster):
match_confidence = contributor_name_match_ratio(potential_match, working_sort_name)
match_confidences["sort_name"] = match_confidence
# fuzzy match filter may not always give a 100% match, so cap arbitrarily at 90% as a "sure match"
if match_confidence > 90:
contributor_data.sort_name=potential_match
return match_confidences
# If we have a display name to look for, and this cluster's
# Wikipedia name converts to the display name, great.
if working_display_name:
wikipedia_name = self.extract_wikipedia_name(cluster)
if wikipedia_name:
contributor_data.wikipedia_name=wikipedia_name
display_name = self.wikipedia_name_to_display_name(wikipedia_name)
match_confidence = contributor_name_match_ratio(display_name, working_display_name)
match_confidences["display_name"] = match_confidence
if match_confidence > 90:
contributor_data.display_name=display_name
return match_confidences
# If there are UNIMARC records, and every part of the UNIMARC
# record matches the sort name or the display name, great.
unimarcs = self._xpath(cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
candidates = []
for unimarc in unimarcs:
(possible_given, possible_family,
possible_extra, possible_sort_name) = self.extract_name_from_unimarc(unimarc)
if working_sort_name:
match_confidence = contributor_name_match_ratio(possible_sort_name, working_sort_name)
match_confidences["unimarc"] = match_confidence
if match_confidence > 90:
contributor_data.family_name=possible_sort_name
return match_confidences
for name in (working_sort_name, working_display_name):
if not name:
continue
if (possible_given and possible_given in name
and possible_family and possible_family in name and (
not possible_extra or possible_extra in name)):
match_confidences["unimarc"] = 90
contributor_data.family_name=possible_family
return match_confidences
# Last-ditch effort. Guess at the sort name and see if *that's* one
# of the cluster sort names.
if working_display_name and not working_sort_name:
test_sort_name = display_name_to_sort_name(working_display_name)
for potential_match in self.sort_names_for_cluster(cluster):
match_confidence = contributor_name_match_ratio(potential_match, test_sort_name)
match_confidences["guessed_sort_name"] = match_confidence
if match_confidence > 90:
contributor_data.sort_name=potential_match
return match_confidences
# OK, last last-ditch effort. See if the alternate name forms (pseudonyms) are it.
if working_sort_name:
for potential_match in self.alternate_name_forms_for_cluster(cluster):
match_confidence = contributor_name_match_ratio(potential_match, working_sort_name)
match_confidences["alternate_name"] = match_confidence
if match_confidence > 90:
contributor_data.family_name=potential_match
return match_confidences
return match_confidences
def order_candidates(self, contributor_candidates, working_sort_name,
known_titles=None, strict=False):
"""
Accepts a list of tuples, each tuple containing:
- a ContributorData object filled with VIAF id, display, sort, family,
and wikipedia names, or None on error.
- a list of work titles ascribed to this Contributor.
For each contributor, determines how likely that contributor is to
be the one being searched for (how well they correspond to the
working_sort_name and known_title.
Assumes the contributor_candidates list was generated off an xml
that was is in popularity order. I.e., the author id that
appears in most libraries when searching for working_sort_name is on top.
Assumes the xml's order is preserved in the contributor_candidates list.
:return: the list of tuples, ordered by percent match, in descending order
(top match first).
"""
if not contributor_candidates:
return contributor_candidates
# Double-check that the candidate list is ordered by library
# popularity, as it came from viaf
        contributor_candidates.sort(key=lambda c: c[1].get('library_popularity', 0))
# Grab the most popular candidate.
(contributor_data, match_confidences, contributor_titles) = contributor_candidates[0]
# If the top library popularity candidate is a really bad name
# match, then don't penalize the bottom popularity candidates
# for being on the bottom.
ignore_popularity = False
if match_confidences.get("library_popularity") == 1:
if ("sort_name" in match_confidences and
match_confidences["sort_name"] < 50):
# baaad match
ignore_popularity = True
if ("guessed_sort_name" in match_confidences and
match_confidences["guessed_sort_name"] < 50):
ignore_popularity = True
if (("sort_name" not in match_confidences) and
("guessed_sort_name" not in match_confidences)):
ignore_popularity = True
        # A higher score means a better match; sort descending so the best
        # match comes first.
contributor_candidates.sort(
key=lambda x: self.weigh_contributor(
x, working_sort_name=working_sort_name,
known_titles=known_titles, strict=strict,
ignore_popularity=ignore_popularity
),
reverse=True
)
return contributor_candidates
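    # A hypothetical call (names invented for illustration):
    #
    #   ordered = parser.order_candidates(
    #       contributor_candidates=candidates,
    #       working_sort_name="Austen, Jane",
    #       known_titles=["Pride and Prejudice"],
    #   )
    #   best_candidate, best_confidences, best_titles = ordered[0]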
def parse_multiple(
self, xml, working_sort_name=None, working_display_name=None, page=1):
""" Parse a VIAF response containing multiple clusters into
contributors and titles.
working_sort_name and working_display_name pertain to the author name string that
we're trying to match in the xml list of clusters.
page refers to pagination -- we can get 10 clusters at a time from VIAF,
        so an author name that matches 15 contributors in a VIAF search will need
2 pages (2 queries going out to VIAF).
NOTE: No longer performs quality judgements on whether the contributor found is good enough.
:return: a list of tuples, each tuple containing:
- a ContributorData object filled with VIAF id, display, sort, family,
and wikipedia names, or None on error.
- a dictionary of viaf cluster properties, with weights assigned to each based on how
well the item in the viaf cluster matches the search parameters passed.
- a list of work titles ascribed to this Contributor.
"""
# TODO: decide: handle timeouts gracefully here, or keep throwing exception?
if not xml:
return []
tree = etree.fromstring(xml, parser=etree.XMLParser(recover=True))
# NOTE: we can get the total number of clusters that a viaf search could return with:
# numberOfRecords_tag = self._xpath1(tree, './/*[local-name()="numberOfRecords"]')
# but it's cleaner to call parse 50 times and quit when it's done than pass around record limits.
        # Each contributor_candidate entry contains 3 objects:
        # a ContributorData, a dictionary of search match confidence weights,
        # and a list of titles ascribed to the contributor.
contributor_candidates = []
for cluster in self._xpath(tree, '//*[local-name()="VIAFCluster"]'):
contributor_data, match_confidences, contributor_titles = self.extract_viaf_info(
cluster, working_sort_name, working_display_name)
if not contributor_data:
continue
# assume we asked for viaf feed, sorted with sortKeys=holdingscount
match_confidences["library_popularity"] = (len(contributor_candidates)+1) + 10 * (page-1)
if contributor_data.display_name or contributor_data.viaf:
contributor_candidate = (contributor_data, match_confidences, contributor_titles)
contributor_candidates.append(contributor_candidate)
        # Clusters with neither names nor viaf ids were skipped above.
return contributor_candidates
def parse(self, xml, working_sort_name=None, working_display_name=None):
""" Parse a VIAF response containing a single cluster.
NOTE: No longer performs quality judgements on whether the contributor found is good enough.
        :return: a tuple of (a ContributorData object filled with display,
        sort, family, and wikipedia names; a dictionary of match confidences;
        a list of titles this author has written), or Nones on error.
"""
tree = etree.fromstring(xml, parser=etree.XMLParser(recover=True))
return self.extract_viaf_info(
tree, working_sort_name, working_display_name
)
def extract_wikipedia_name(self, cluster):
"""Extract Wiki name from a single VIAF cluster."""
for source in self._xpath(cluster, './/*[local-name()="sources"]/*[local-name()="source"]'):
if source.text.startswith("WKP|"):
                # This could be a Wikipedia page, which is great, or it
# could be a Wikidata ID, which we don't want.
potential_wikipedia = source.text[4:]
if not self.wikidata_id.search(potential_wikipedia):
return potential_wikipedia
def sort_names_by_popularity(self, cluster):
sort_name_popularity = Counter()
for possible_sort_name in self.sort_names_for_cluster(cluster):
if possible_sort_name.endswith(","):
possible_sort_name = possible_sort_name[:-1]
sort_name_popularity[possible_sort_name] += 1
return sort_name_popularity
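    # For example (illustrative): if a cluster's 100/110 fields contain
    # "Austen, Jane," twice and "Jane Austen" once, this returns a Counter
    # of {"Austen, Jane": 2, "Jane Austen": 1} (trailing commas stripped),
    # and the most common form later serves as the fallback sort name.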
    def extract_viaf_info(self, cluster, working_sort_name=None,
                          working_display_name=None):
""" Extract name info from a single VIAF cluster.
:return: a tuple containing:
- ContributorData object filled with display, sort, family, and wikipedia names.
- dictionary of ways the xml cluster data matched the names searched for.
- list of titles attributed to the contributor in the cluster.
or Nones on error.
"""
contributor_data = ContributorData()
contributor_titles = []
match_confidences = {}
# Find out if one of the working names shows up in a name record.
# Note: Potentially sets contributor_data.sort_name.
match_confidences = self.cluster_has_record_for_named_author(
cluster, working_sort_name, working_display_name,
contributor_data
)
# Get the VIAF ID for this cluster, just in case we don't have one yet.
viaf_tag = self._xpath1(cluster, './/*[local-name()="viafID"]')
if viaf_tag is None:
contributor_data.viaf = None
else:
contributor_data.viaf = viaf_tag.text
# If we don't have a working sort name, find the most popular
# sort name in this cluster and use it as the sort name.
sort_name_popularity = self.sort_names_by_popularity(cluster)
# Does this cluster have a Wikipedia page?
contributor_data.wikipedia_name = self.extract_wikipedia_name(cluster)
if contributor_data.wikipedia_name:
contributor_data.display_name = self.wikipedia_name_to_display_name(contributor_data.wikipedia_name)
working_display_name = contributor_data.display_name
# TODO: There's a problem here when someone's record has a
# Wikipedia page other than their personal page (e.g. for
# a band they're in.)
known_name = working_sort_name or working_display_name
unimarcs = self._xpath(cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
candidates = []
for unimarc in unimarcs:
(possible_given, possible_family,
possible_extra, possible_sort_name) = self.extract_name_from_unimarc(unimarc)
# Some part of this name must also show up in the original
# name for it to even be considered. Otherwise it's a
# better bet to try to munge the original name.
for v in (possible_given, possible_family, possible_extra):
if not v:
continue
if not known_name or v in known_name:
self.log.debug(
"FOUND %s in %s", v, known_name
)
candidates.append((possible_given, possible_family, possible_extra))
if possible_sort_name:
if possible_sort_name.endswith(","):
possible_sort_name = possible_sort_name[:-1]
sort_name_popularity[possible_sort_name] += 1
break
else:
self.log.debug(
"EXCLUDED %s/%s/%s for lack of resemblance to %s",
possible_given, possible_family, possible_extra,
known_name
)
if sort_name_popularity and not contributor_data.sort_name:
            contributor_data.sort_name, _ = sort_name_popularity.most_common(1)[0]
if contributor_data.display_name:
parts = contributor_data.display_name.split(" ")
if len(parts) == 2:
# Pretty clearly given name+family name.
# If it gets more complicated than this we can't
# be confident.
candidates.append(parts + [None])
display_nameparts = self.best_choice(candidates)
if display_nameparts[1]: # Family name
contributor_data.family_name = display_nameparts[1]
        contributor_data.display_name = (
            contributor_data.display_name
            or self.combine_nameparts(*display_nameparts)
            or working_display_name
        )
# Now go through the title elements, and make a list.
titles = self._xpath(cluster, './/*[local-name()="titles"]/*[local-name()="work"]/*[local-name()="title"]')
for title in titles:
contributor_titles.append(title.text)
return contributor_data, match_confidences, contributor_titles
def wikipedia_name_to_display_name(self, wikipedia_name):
""" Convert 'Bob_Jones_(Author)' to 'Bob Jones'. """
display_name = wikipedia_name.replace("_", " ")
if ' (' in display_name:
display_name = display_name[:display_name.rindex(' (')]
return display_name
def best_choice(self, possibilities):
"""Return the best (~most popular) choice among the given names.
:param possibilities: A list of (given, family, extra) 3-tuples.
"""
if not possibilities:
return None, None, None
elif len(possibilities) == 1:
# There is only one choice. Use it.
return possibilities[0]
# There's more than one choice, so it's gonna get
# complicated. First, find the most common family name.
family_names = Counter()
given_name_for_family_name = defaultdict(Counter)
extra_for_given_name_and_family_name = defaultdict(Counter)
for given_name, family_name, name_extra in possibilities:
self.log.debug(
"POSSIBILITY: %s/%s/%s",
given_name, family_name, name_extra
)
if family_name:
family_names[family_name] += 1
if given_name:
given_name_for_family_name[family_name][given_name] += 1
extra_for_given_name_and_family_name[(family_name, given_name)][name_extra] += 1
if not family_names:
# None of these are useful.
return None, None, None
family_name = family_names.most_common(1)[0][0]
given_name = None
name_extra = None
# Now find the most common given name, given the most
# common family name.
given_names = given_name_for_family_name[family_name]
if given_names:
given_name = given_names.most_common(1)[0][0]
extra = extra_for_given_name_and_family_name[
(family_name, given_name)]
if extra:
name_extra, count = extra.most_common(1)[0]
                # Don't add extra stuff on to the name if leaving it off
                # is a viable option.
if extra[None] == count:
name_extra = None
return given_name, family_name, name_extra
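    # An illustrative trace: given possibilities
    #   [("Jane", "Austen", None), ("Jane", "Austen", "pseud."), ("J.", "Austen", None)]
    # the most common family name is "Austen", the most common given name for
    # "Austen" is "Jane", and since omitting the extra is a viable option
    # (None appears for that given/family pair), this returns
    # ("Jane", "Austen", None).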
def remove_commas_from(self, namepart):
"""Strip dangling commas from a namepart."""
if namepart.endswith(","):
namepart = namepart[:-1]
if namepart.startswith(","):
namepart = namepart[1:]
return namepart.strip()
def extract_name_from_unimarc(self, unimarc):
"""Turn a UNIMARC tag into a 4-tuple:
(given name, family name, extra, sort name)
"""
data = dict()
sort_name_in_progress = []
for (code, key) in (
('a', 'family'),
('b', 'given'),
('c', 'extra'),
):
value = self._xpath1(unimarc, 'ns2:subfield[@code="%s"]' % code)
if value is not None and value.text:
value = value.text
value = self.remove_commas_from(value)
sort_name_in_progress.append(value)
data[key] = value
return (data.get('given', None), data.get('family', None),
data.get('extra', None), ", ".join(sort_name_in_progress))
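    # An illustrative sketch (invented fragment): a UNIMARC datafield like
    #
    #   <ns2:datafield dtype="UNIMARC" tag="200">
    #     <ns2:subfield code="a">Austen,</ns2:subfield>
    #     <ns2:subfield code="b">Jane</ns2:subfield>
    #   </ns2:datafield>
    #
    # should come back as ("Jane", "Austen", None, "Austen, Jane"): subfields
    # are read in a/b/c order, dangling commas are stripped, and the sort
    # name joins the parts found with ", ".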
class VIAFClient(object):
LOOKUP_URL = 'http://viaf.org/viaf/%(viaf)s/viaf.xml'
SEARCH_URL = 'http://viaf.org/viaf/search?query={scope}+all+%22{author_name}%22&sortKeys=holdingscount&maximumRecords={maximum_records:d}&startRecord={start_record:d}&httpAccept=text/xml'
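    # For illustration, a first-page personal-name search for "Jane Austen"
    # expands SEARCH_URL to roughly:
    #
    #   http://viaf.org/viaf/search?query=local.personalNames+all+%22Jane Austen%22
    #       &sortKeys=holdingscount&maximumRecords=10&startRecord=1&httpAccept=text/xml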
SUBDIR = "viaf"
MEDIA_TYPE = Representation.TEXT_XML_MEDIA_TYPE
REPRESENTATION_MAX_AGE = 60*60*24*30*6 # 6 months
def __init__(self, _db):
self._db = _db
self.parser = VIAFParser()
self.log = logging.getLogger("VIAF Client")
@property
def data_source(self):
return DataSource.lookup(self._db, DataSource.VIAF)
def process_contributor(self, contributor):
""" Accepts a Contributor object, and asks VIAF for information on the contributor's name.
Finds the VIAF cluster that's most likely to correspond to the passed-in contributor.
Finds any possible duplicate Contributor objects in our database, and
updates them with the information gleaned from VIAF.
:return: a ContributorData object filled with display, sort, family, and wikipedia names
from VIAF or None on error.
"""
if contributor.viaf:
contributor_candidate = self.lookup_by_viaf(
contributor.viaf, contributor.sort_name, contributor.display_name
)
else:
known_titles = set()
if contributor.contributions:
for contribution in contributor.contributions:
if contribution.edition and contribution.edition.title:
known_titles.add(contribution.edition.title)
contributor_candidate = self.lookup_by_name(
sort_name=contributor.sort_name, display_name=contributor.display_name,
known_titles=list(known_titles)
)
if not contributor_candidate:
# No good match was identified.
return None
(selected_candidate, match_confidences, contributor_titles) = contributor_candidate
if selected_candidate.viaf is not None:
# Is there already another contributor with this VIAF?
earliest_duplicate = self._db.query(Contributor).\
filter(Contributor.viaf==selected_candidate.viaf).\
filter(Contributor.id!=contributor.id).first()
if earliest_duplicate:
if earliest_duplicate.display_name == selected_candidate.display_name:
selected_candidate.apply(earliest_duplicate)
contributor.merge_into(earliest_duplicate)
return
else:
# TODO: This might be okay or it might be a
# problem we need to address. Whatever it is,
# don't merge the records. Instead, apply the VIAF
# data to the provided contributor, potentially
# creating an accursed duplicate.
                    self.log.warning(
"AVOIDING POSSIBLE SPURIOUS AUTHOR MERGE: %r => %r",
selected_candidate, earliest_duplicate
)
selected_candidate.apply(contributor)
def select_best_match(self, candidates, working_sort_name, known_titles=None):
"""Gets the best VIAF match from a series of potential matches
Return a tuple containing the selected_candidate (a ContributorData
object), a dict of match_confidences, and a list of titles by the
contributor.
:param known_titles: A list of titles we know this author wrote.
"""
# Sort for the best match and select the first.
candidates = self.parser.order_candidates(
working_sort_name=working_sort_name,
contributor_candidates=candidates,
known_titles=known_titles
)
if not candidates:
return None
(selected_candidate, match_confidences, contributor_titles) = candidates[0]
if (not selected_candidate or "total" not in match_confidences or
match_confidences["total"] < 70):
# The best match is dubious. Best to avoid this.
return None
return selected_candidate, match_confidences, contributor_titles
def lookup_name_title(self, viaf, do_get=None):
url = self.LOOKUP_URL % dict(viaf=viaf)
r, cached = Representation.get(
self._db, url, do_get=do_get, max_age=self.REPRESENTATION_MAX_AGE
)
xml = r.content
cluster = etree.fromstring(xml, parser=etree.XMLParser(recover=True))
titles = []
for potential_title in self.parser.name_titles_for_cluster(cluster):
titles.append(potential_title)
return titles
def lookup_by_viaf(self, viaf, working_sort_name=None,
working_display_name=None, do_get=None):
url = self.LOOKUP_URL % dict(viaf=viaf)
r, cached = Representation.get(
self._db, url, do_get=do_get, max_age=self.REPRESENTATION_MAX_AGE
)
xml = r.content
return self.parser.parse(xml, working_sort_name, working_display_name)
def lookup_by_name(self, sort_name, display_name=None, do_get=None,
known_titles=None):
"""
        Asks VIAF for a list of author clusters matching the passed-in
author name. Selects the cluster we deem the best match for
the author we mean.
:param sort_name: Author name in Last, First format.
:param display_name: Author name in First Last format.
        :param do_get: Ask Representation to use HTTP GET?
:param known_titles: A list of titles we know this author wrote.
:return: (selected_candidate, match_confidences, contributor_titles) for selected ContributorData.
"""
author_name = sort_name or display_name
# from OCLC tech support:
# VIAF's SRU endpoint can only return a maximum number of 10 records
# when the recordSchema is http://viaf.org/VIAFCluster
maximum_records = 10 # viaf maximum that's not ignored
        contributor_candidates = []
        # Limit ourselves to reading the first 500 viaf clusters (50 pages of
        # 10 records each), on the assumption that search match quality is
        # unlikely to be usable after that.
        for page in range(1, 51):
start_record = 1 + maximum_records * (page-1)
scope = 'local.personalNames'
if is_corporate_name(author_name):
scope = 'local.corporateNames'
url = self.SEARCH_URL.format(
scope=scope, author_name=author_name.encode("utf8"),
maximum_records=maximum_records, start_record=start_record
)
representation, cached = Representation.get(
self._db, url, do_get=do_get, max_age=self.REPRESENTATION_MAX_AGE
)
xml = representation.content
candidates = self.parser.parse_multiple(xml, sort_name, display_name, page)
if not any(candidates):
# Delete the representation so it's not cached.
self._db.query(Representation).filter(
Representation.id==representation.id
).delete()
# We ran out of clusters, so we can relax and move on to
# ordering the returned results
break
contributor_candidates.extend(candidates)
best_match = self.select_best_match(candidates=contributor_candidates,
working_sort_name=author_name, known_titles=known_titles)
return best_match
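# A hypothetical end-to-end use of the client above (names invented for
# illustration; assumes a SQLAlchemy session in _db):
#
#   client = VIAFClient(_db)
#   match = client.lookup_by_name(
#       sort_name="Austen, Jane",
#       known_titles=["Pride and Prejudice"],
#   )
#   if match:
#       contributor_data, match_confidences, contributor_titles = match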
class MockVIAFClient(VIAFClient):
def __init__(self, _db):
super(MockVIAFClient, self).__init__(_db)
self.log = logging.getLogger("Mocked VIAF Client")
base_path = os.path.split(__file__)[0]
self.resource_path = os.path.join(base_path, "tests", "files", "viaf")
self.viaf_lookups = []
self.name_lookups = []
def get_data(self, filename):
        # Returns the contents of a sample file as an xml string.
        path = os.path.join(self.resource_path, filename)
        with open(path) as f:
            data = f.read()
return data
def lookup_by_viaf(self, viaf, working_sort_name=None,
working_display_name=None, do_get=None):
self.viaf_lookups.append(
(viaf, working_sort_name, working_display_name)
)
xml = self.get_data("mindy_kaling.xml")
return self.parser.parse(xml, working_sort_name, working_display_name)
def lookup_by_name(self, sort_name, display_name=None, do_get=None,
known_titles=None):
self.name_lookups.append(
(sort_name, display_name, known_titles)
)
def do_get(*args, **kwargs):
return (200, {}, self.get_data("mindy_kaling.xml"))
return super(MockVIAFClient, self).lookup_by_name(
sort_name, display_name, do_get, known_titles
)