# =============================================================================
# AUSTRALIAN NATIONAL UNIVERSITY OPEN SOURCE LICENSE (ANUOS LICENSE)
# VERSION 1.3
#
# The contents of this file are subject to the ANUOS License Version 1.3
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# https://sourceforge.net/projects/febrl/
#
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
# the License for the specific language governing rights and limitations
# under the License.
#
# The Original Software is: "indexing.py"
#
# The Initial Developer of the Original Software is:
# Dr Peter Christen (Research School of Computer Science, The Australian
# National University)
#
# Copyright (C) 2002 - 2011 the Australian National University and
# others. All Rights Reserved.
#
# Contributors:
#
# Alternatively, the contents of this file may be used under the terms
# of the GNU General Public License Version 2 or later (the "GPL"), in
# which case the provisions of the GPL are applicable instead of those
# above. The GPL is available at the following URL: http://www.gnu.org/
# If you wish to allow use of your version of this file only under the
# terms of the GPL, and not to allow others to use your version of this
# file under the terms of the ANUOS License, indicate your decision by
# deleting the provisions above and replace them with the notice and
# other provisions required by the GPL. If you do not delete the
# provisions above, a recipient may use your version of this file under
# the terms of any one of the ANUOS License or the GPL.
# =============================================================================
#
# Freely extensible biomedical record linkage (Febrl) - Version 0.4.2
#
# See: http://datamining.anu.edu.au/linkage.html
#
# =============================================================================
"""Module with classes for indexing/blocking.
This module provides classes for building blocks and indices that will be
used in the linkage process for comparing record pairs in an efficient way.
Various derived classes are provided that implement different indexing
techniques:
FullIndex Performs no indexing but does the full comparison
of all record pairs (quadratic complexity in
the number of records in the two data sets).
BlockingIndex The 'standard' blocking index used for record
linkage.
SortingIndex Based on a sliding window over the sorted values
of the index variable definitions - uses an
inverted index approach where keys are unique
index variable values.
SortingArrayIndex Based on a sliding window over the sorted values
of the index variable definitions - uses an array
based approach where all index variable values
(including duplicates) are stored.
AdaptSortingIndex An adaptive version of the array based sorted
index.
QGramIndex Allows for fuzzy indexing with 'overlapping'
blocks, like clustering.
CanopyIndex Based on TF-IDF/Jaccard and canopy clustering.
StringMapIndex Based on the string-map multi-dimensional mapping
algorithm combined with canopy clustering.
SuffixArrayIndex Based on a suffix array, resulting in a similar
approach as the SortingIndex (as blocks are
created by going through the sorted suffix array).
RobustSuffixArrayIndex Combines the suffix array approach with the sorted
window approach to overcome variations in the
index key values.
BigMatchIndex Based on the BigMatch program developed by the US
Census Bureau. This index can only handle linkages
(not deduplications). Based on the idea to only
index the smaller data set into an inverted index,
and then process each record from the larger data
set as it is read from file.
DedupIndex An index specialised for deduplications. It
performs the build(), compact() and run() steps in
one routine: reading the file, building the index
and comparing the record pairs are all done in the
run() routine.
When initialising an index its index variables have to be defined using the
attribute 'index_def' (see more details below).
Creating and using an index normally consists of the following three steps:
- build Read the records from the data sets and build the indexing data
structures.
- compact Make the indexing data structures more compact (more efficient
for accessing the record pairs).
- run Run the comparison step (i.e. compare record pairs) on the index.
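As an illustrative sketch only (assuming 'example_index' is an already
initialised index object, such as the BlockingIndex shown in the Indexing
class documentation below):
example_index.build()
example_index.compact()
[field_names_list, weight_vec_dict] = example_index.run()
Unless 'weight_vec_file' is set, run() normally returns a list with the
field comparison names and a dictionary containing one weight vector per
compared record pair; if 'weight_vec_file' is set, the weight vectors are
written to that file instead and None is returned.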
Main bottlenecks in the implemented indices are:
- QGramIndex: Creating the sub-lists (recursively), especially for
long index values and low thresholds. Two different
functions are implemented and used for different threshold
values.
- SortingIndex: The way the inverted index data is combined when a sliding
window is created.
- CanopyIndex: Creating canopies, again especially for index values having
many q-grams and large data sets (resulting in long inverted
index lists).
"""
# =============================================================================
# Import necessary modules (Python standard modules first, then Febrl modules)
import csv
import heapq
import gc
import logging
import math
import random
import shelve
import time
import auxiliary
import dataset
import encode
# =============================================================================
class Indexing:
"""Base class for indexing. Handles index initialisation, as well as saving
and loading of indices to/from files.
Indexing (or blocking as it is called in traditional record linkage) is
used to reduce the large number of possible record pair comparisons by
grouping together records into 'blocks' that are similar according to some
criteria.
Indices have to be defined when an index is initialised. Basically, an
index is specific to two fields (one per data set). An index definition is
made of a list containing one or more sub-lists each made of:
1) The name of a field from data set 1 which is used for the index.
2) The name of a field from data set 2 which is used for the index.
3) A flag, if set to True and the index value contains more than one word
(i.e. contains whitespace characters), then these words will be sorted
first (before being possibly reversed and then encoded). For example,
a value 'peter a. christen' would be sorted into 'a. christen peter'.
If set to False, no such sorting will be done.
4) A flag, if set to True the index variable (assumed to be a string) will
be reversed before it is used within an index definition, if set to
False it is not reversed.
5) A positive integer specifying the maximum length (in characters) of the
index variable; or None (no maximum length of the index variable).
6) A list made of a function and its input parameters that will be applied
to the data set field values to create an index variable. If this list
is empty (or no list is given but None instead), then the field values
will be taken directly as index variables.
The function can be any function that has a string as its first input
argument and returns a string.
The final index variable values are the concatenated values (possibly with
a separator string between as detailed below) of each of the index
definitions.
Here is an example index definition:
index_def_1 = [['surname','sname',True,True,None,[encode.soundex]],
['postcode','zipcode',False,False,3,[]]]
For this index, index variables will be the concatenations of the Soundex
encoding of the sorted (if containing more than one word) reversed
surnames (fields 'surname' in data set 1 and field 'sname' in data set 2)
and the first three characters (digits) from the fields 'postcode' from
data set 1 and 'zipcode' from data set 2.
So for a record ['peter','meier','2000','sydney'] in data set 1 and a
record ['paul','meyer','2010','sydney south'] in data set 2 the following
two index variable values will be constructed (and inserted into the index
data structure):
['peter','meier','2000','sydney'] -> 'r500'+'200' -> 'r500200'
['paul','meyer','2010','sydney south'] -> 'r500'+'201' -> 'r500201'
So these two records will (depending upon the index implementation) likely
be put into different 'blocks' and therefore not be compared.
Here is another example:
index_def_2 = \
[['year_of_birth','yob',False,False,None,[encode.get_substring,3,5]],
['locality_name','loc_name',True,True,None,[encode.dmetaphone, 4]]]
In this example, the third and fourth year digits extracted from the
fields 'year_of_birth' (from data set 1) and 'yob' (from data set 2) will
be concatenated with the Double-Metaphone encoding (first 4 characters of
the encoding only) of the sorted and reversed values from the fields
'locality_name' (from data set 1) and 'loc_name' (from data set 2).
The final index definition given to an index when it is initialised is a
list of index definitions, for example:
full_index = indexing.BlockingIndex(description = 'Example full index',
dataset1 = example_csv_data_set,
dataset2 = example_csv_data_set,
index_def = [index_def_1,
index_def_2],
index_sep_str = ' ',
rec_comparator = example_rec_comp)
In this example, two indices will be built according to the two index
definitions provided.
All index classes have the following instance variables, which can be set
when an index is initialised:
description A string describing the index
dataset1 The first data set the index is built for
dataset2 The second data set the index is built for
rec_comparator The record comparator
index_def Definitions of the indexes as described above
index_sep_str A separator string which will be inserted if an index
is made of more than one value. Default is the empty
string ''.
skip_missing A flag, if set to True records which have empty index
variable values will be skipped over, if set to False a
record with an empty indexing variable value will be
included in the index as well. Default value is True
log_funct This can be a Python function or method which will log
(print or save to a file) a progress report message. It
is assumed that this function or method has one input
argument of type string (the message to be printed).
Default is None, in which case the normal logging
module will be used.
progress_report Can be set to a percentage number between 1 and 50 in
which case a progress report is logged during the
record pair comparison stage (in the run() method)
every selected percentage number. Default value is 10.
If set to None no progress report will be logged.
weight_vec_file If this is set to a string (assumed to be a file name)
then the weight vectors will be written into this file
and NOT stored in the weight_vector dictionary. Each
weight vector is a comma separated line containing:
rec_id1, rec_id2, weight1, weight2, ... weightN
Default value is None, in which case the weight vectors
will not be written into a file but returned as a
dictionary.
Note that skip_missing cannot be set to False for certain index methods,
see their documentation for more details.
Both the data sets and the index definitions must be provided when an
index is initialised.
"""
# ---------------------------------------------------------------------------
def __init__(self, base_kwargs):
"""Constructor
"""
# General attributes for all index implementations
#
self.description = ''
self.dataset1 = None
self.dataset2 = None
self.rec_comparator = None # A record comparator
self.index_def = None # The index definitions
self.index_sep_str = '' # Separator string for in-between index values
self.skip_missing = True
self.progress_report = 10
self.log_funct = None
self.weight_vec_file = None
self.index_def_proc = None # Processed version of the index
# definition for faster access to field
# values
self.index1 = {} # The index data structure for data set 1
self.index2 = {} # The index data structure for data set 2
self.index1_shelve_name = None # If the index data structure for data
# set 1 is to be file (shelve) based this
# will be its file name
self.index2_shelve_name = None # Same for data set 2
self.rec_cache1 = {} # A dictionary containing all records
# from data set 1 with only the fields
# needed for field comparisons
self.rec_cache2 = {} # Same for data set 2
self.rec_cache1_file_name = None # If the record cache for data set 1
# should be file (shelve) based this will
# be its file name
self.rec_cache2_file_name = None # Same for data set 2
self.num_rec_pairs = None # The number of record pairs that will be
# compared when the run() method is
# called
self.do_deduplication = None # A flag, set to True if both data sets
# are the same
self.comp_field_used1 = [] # A list of the fields in data set 1 as
# used by the record comparator (column
# indices)
self.comp_field_used2 = [] # Same for data set 2
self.rec_length_cache = {} # Used in length filtering in run() method
# Process base keyword arguments (all data set specific keywords were
# processed in the derived class constructor)
#
for (keyword, value) in base_kwargs.items():
if (keyword.startswith('desc')):
auxiliary.check_is_string('description', value)
self.description = value
elif (keyword == 'dataset1'):
auxiliary.check_is_list('Dataset 1 field list', value.field_list)
self.dataset1 = value
elif (keyword == 'dataset2'):
auxiliary.check_is_list('Dataset 2 field list', value.field_list)
self.dataset2 = value
elif (keyword.startswith('index_d')):
auxiliary.check_is_list('index_def', value)
self.index_def = value
elif (keyword.startswith('index_sep')):
auxiliary.check_is_string('index_sep_str', value)
self.index_sep_str = value
elif (keyword.startswith('index1_she')):
auxiliary.check_is_string('index1_shelve_name', value)
self.index1_shelve_name = value
elif (keyword.startswith('index2_she')):
auxiliary.check_is_string('index2_shelve_name', value)
self.index2_shelve_name = value
elif (keyword.startswith('rec_cache1_f')):
auxiliary.check_is_string('rec_cache1_file_name', value)
self.rec_cache1_file_name = value
elif (keyword.startswith('rec_cache2_f')):
auxiliary.check_is_string('rec_cache2_file_name', value)
self.rec_cache2_file_name = value
elif (keyword.startswith('rec_com')):
self.rec_comparator = value
elif (keyword.startswith('skip')):
auxiliary.check_is_flag('skip_missing', value)
self.skip_missing = value
elif (keyword.startswith('progr')):
if (value == None):
self.progress_report = value
else:
auxiliary.check_is_integer('progress_report', value)
if ((value < 1) or (value > 50)):
logging.exception('Illegal value for progress report, must be ' + \
'None or between 1 and 50: "%s"' % \
(str(value)))
raise Exception
self.progress_report = value
elif (keyword.startswith('log_f')):
auxiliary.check_is_function_or_method('log_funct', value)
self.log_funct = value
elif (keyword.startswith('weight_v')):
if (value != None):
auxiliary.check_is_string('weight_vec_file', value)
self.weight_vec_file = value
else:
logging.exception('Illegal constructor argument keyword: '+keyword)
raise Exception
# Check if the needed attributes are set - - - - - - - - - - - - - - - - -
#
auxiliary.check_is_list('index_def', self.index_def)
auxiliary.check_is_not_none('rec_comparator', self.rec_comparator)
auxiliary.check_is_not_none('Dataset 1', self.dataset1)
auxiliary.check_is_not_none('Dataset 2', self.dataset2)
auxiliary.check_is_list('Dataset 1 field list', self.dataset1.field_list)
auxiliary.check_is_list('Dataset 2 field list', self.dataset2.field_list)
# Check if the data sets in the record comparator are the same as the ones
# given in the index
#
if (self.rec_comparator.dataset1 != self.dataset1):
logging.exception('Data set 1 in record comparator is different from' + \
' data set 1 in index: "%s" / "%s"' % \
(self.rec_comparator.dataset1, self.dataset1))
raise Exception
if (self.rec_comparator.dataset2 != self.dataset2):
logging.exception('Data set 2 in record comparator is different from' + \
' data set 2 in index: "%s" / "%s"' % \
(self.rec_comparator.dataset2, self.dataset2))
raise Exception
# Check if a deduplication will be done or a linkage - - - - - - - - - - -
#
if (self.dataset1 == self.dataset2):
self.do_deduplication = True
else:
self.do_deduplication = False
# If indices or record caches are file (shelve) based open these shelves -
#
if (self.index1_shelve_name != None):
self.index1 = self.__open_shelve_file__(self.index1_shelve_name)
if (self.index2_shelve_name != None):
self.index2 = self.__open_shelve_file__(self.index2_shelve_name)
if (self.rec_cache1_file_name != None):
self.rec_cache1 = self.__open_shelve_file__(self.rec_cache1_file_name)
if (self.rec_cache2_file_name != None):
self.rec_cache2 = self.__open_shelve_file__(self.rec_cache2_file_name)
# Extract the field names from the two data set field name lists - - - - -
#
dataset1_field_names = []
dataset2_field_names = []
for (field_name, field_data) in self.dataset1.field_list:
dataset1_field_names.append(field_name)
for (field_name, field_data) in self.dataset2.field_list:
dataset2_field_names.append(field_name)
# Get the column indices of the fields used in the record comparator - - -
#
for (comp,f_ind1,f_ind2) in self.rec_comparator.field_comparison_list:
if (f_ind1 not in self.comp_field_used1):
self.comp_field_used1.append(f_ind1)
if (f_ind2 not in self.comp_field_used2):
self.comp_field_used2.append(f_ind2)
self.comp_field_used1.sort()
self.comp_field_used2.sort()
# Check if definition of indices is correct and fields are in the data sets
#
self.index_def_proc = [] # Checked and processed index definitions will be
# added here
for index_def_list in self.index_def:
auxiliary.check_is_list('Index definition list "%s"' % \
(str(index_def_list)), index_def_list)
index_def_list_proc = []
for index_def in index_def_list:
auxiliary.check_is_list('Index definition "%s"' % \
(str(index_def)), index_def)
if (index_def == []):
logging.info('Empty index definition given: %s' % \
(str(self.index_def)))
# Check the two field names
#
field_name1 = index_def[0]
field_name2 = index_def[1]
if (field_name1 not in dataset1_field_names):
logging.exception('Field "%s" is not in data set 1 field name ' \
% (field_name1) + 'list: %s' % \
(str(self.dataset1.field_list)))
raise Exception
field_index1 = dataset1_field_names.index(field_name1)
if (field_name2 not in dataset2_field_names):
logging.exception('Field "%s" is not in data set 2 field name ' \
% (field_name2) + 'list: %s' % \
(str(self.dataset2.field_list)))
raise Exception
field_index2 = dataset2_field_names.index(field_name2)
index_def_proc = [field_index1,field_index2] # Processed index def.
# Check if sort words flag is True or False
#
auxiliary.check_is_flag('Sort words flag', index_def[2])
index_def_proc.append(index_def[2])
# Check if reverse flag is True or False
#
auxiliary.check_is_flag('Reverse flag', index_def[3])
index_def_proc.append(index_def[3])
# Check maximum length is a positive integer or None
#
if (index_def[4] == None):
index_def_proc.append(None)
else:
auxiliary.check_is_integer('Maximum length', index_def[4])
auxiliary.check_is_positive('Maximum length', index_def[4])
index_def_proc.append(index_def[4])
# Check function definition
#
if ((index_def[5] != None) and (len(index_def[5]) > 0)):
index_funct_def = index_def[5]
auxiliary.check_is_function_or_method('Function "%s"' % \
(index_funct_def[0]), index_funct_def[0])
index_def_proc.append(index_funct_def)
else:
index_def_proc.append(None)
index_def_list_proc.append(index_def_proc)
self.index_def_proc.append(index_def_list_proc)
assert len(self.index_def) == len(self.index_def_proc)
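# As an illustrative sketch (with made-up column positions, assuming
# 'surname' is column 1 and 'postcode' column 3 in both data sets), an
# index definition
# [['surname','surname',True,True,None,[encode.soundex]],
#  ['postcode','postcode',False,False,3,[]]]
# is processed into
# [[1, 1, True, True, None, [encode.soundex]],
#  [3, 3, False, False, 3, None]]
# i.e. field names are replaced by their column indices and an empty
# function list is replaced by None.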
self.status = 'initialised' # Status of the index (used by save and load
# methods)
# ---------------------------------------------------------------------------
def build(self):
"""Method to build an index data structure.
See implementations in derived classes for details.
"""
logging.exception('Override abstract method in derived class')
raise Exception
# ---------------------------------------------------------------------------
def __records_into_inv_index__(self):
"""Load the records from the data sets and put them into an inverted index
data structure.
If both data sets are the same (a deduplication) only insert records
from one data set.
This method builds an inverted index (one per index definition) as a
Python dictionary with the keys being the indexing values (as returned
by the __get_index_values__() method).
"""
logging.info('Started to build inverted index:')
num_indices = len(self.index_def)
# Index data structure for blocks is one dictionary per index - - - - - - -
#
for i in range(num_indices):
self.index1[i] = {} # Index for data set 1
self.index2[i] = {} # Index for data set 2
get_index_values_funct = self.__get_index_values__ # Shorthands
skip_missing = self.skip_missing
# A list of data structures needed for the build process:
# - the index data structure (dictionary)
# - the record cache
# - the data set to be read
# - the comparison fields which are used
# - a list index (0 for data set 1, 1 for data set 2)
#
build_list = [(self.index1, self.rec_cache1, self.dataset1,
self.comp_field_used1, 0)] # For data set 1
if (self.do_deduplication == False): # If linkage append data set 2
build_list.append((self.index2, self.rec_cache2, self.dataset2,
self.comp_field_used2, 1))
# Reading loop over all records in one or both data set(s) - - - - - - - -
#
for (index,rec_cache,dataset,comp_field_used_list,ds_index) in build_list:
# Calculate a counter for the progress report
#
if (self.progress_report != None):
progress_report_cnt = max(1, int(dataset.num_records / \
(100.0 / self.progress_report)))
else: # So no progress report is being logged
progress_report_cnt = dataset.num_records + 1
start_time = time.time()
rec_read = 0 # Number of records read from data set
for (rec_ident, rec) in dataset.readall(): # Read all records in data set
# Extract record fields needed for comparisons (set all others to '')
#
comp_rec = []
field_ind = 0
for field in rec:
if (field_ind in comp_field_used_list):
comp_rec.append(field.lower()) # Make them lower case
else:
comp_rec.append('')
field_ind += 1
rec_cache[rec_ident] = comp_rec # Put into record cache
# Now get the index variable values for this record - - - - - - - - - -
#
rec_index_val_list = get_index_values_funct(rec, ds_index)
for i in range(num_indices): # Put record identifier into all indices
this_index = index[i] # Shorthand
block_val = rec_index_val_list[i]
if ((block_val != '') or (skip_missing == False)):
block_val_rec_list = this_index.get(block_val, [])
block_val_rec_list.append(rec_ident)
this_index[block_val] = block_val_rec_list
rec_read += 1
if ((rec_read % progress_report_cnt) == 0):
self.__log_build_progress__(rec_read,dataset.num_records,start_time)
used_sec_str = auxiliary.time_string(time.time()-start_time)
rec_time_str = auxiliary.time_string((time.time()-start_time) / \
dataset.num_records)
logging.info('Read and indexed %d records in %s (%s per record)' % \
(dataset.num_records, used_sec_str, rec_time_str))
logging.info('')
# ---------------------------------------------------------------------------
# Get sub-list functions are used for the q-gram and BigMatch index
def __get_sublists1__(self, in_list, min_len):
"""An iterative method to compute all combinations of sub-lists of the list
'in_list' of lengths down to 'min_len'.
This routine seems to be faster for larger 'min_len' values (compared to
the length of the input list), i.e. for less deep recursion levels (which
correspond to higher threshold values), than the recursive generator
based method given below.
"""
all_list = [in_list]
in_len = len(in_list)
if (in_len > min_len):
for i in xrange(in_len):
sub_list = in_list[:i]+in_list[i+1:]
if (sub_list not in all_list):
all_list.append(sub_list)
if ((in_len-1) > min_len):
for sub_sub_list in self.__get_sublists1__(sub_list, min_len):
if (sub_sub_list not in all_list):
all_list.append(sub_sub_list)
return all_list
# ---------------------------------------------------------------------------
def __get_sublists2__(self, in_list, min_len):
"""Method to recursively compute all combinations of sub-lists of the list
'in_list' of lengths down to 'min_len' using a Python generator.
This routine seems to be faster for deeper recursion levels (which
correspond to lower threshold values) than the iterative method given
above.
"""
unique_combinations_funct = self.__unique_combinations2__ # Shorthand
all_list = [in_list]
in_len = len(in_list)
for i in xrange(in_len-1, min_len-1, -1):
for sub_list in unique_combinations_funct(in_list, i):
if (sub_list not in all_list):
all_list.append(sub_list)
return all_list
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def __unique_combinations__(self, in_list, n):
"""Get unique combinations of length 'n' of the given input list. Based on
a recipe from Activestate Python cookbook, see:
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/190465
"""
unique_combinations_funct = self.__unique_combinations__ # Shorthand
if (n == 0):
yield []
else:
for i in xrange(len(in_list)):
for sub_list in unique_combinations_funct(in_list[i+1:],n-1):
yield [in_list[i]]+sub_list
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def __unique_combinations2__(self, in_list, n):
"""Another version from Activestate Python cookbook, see:
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66465
Seems to be of similar speed to the previous function.
"""
unique_combinations_funct = self.__unique_combinations2__ # Shorthand
for i in xrange(len(in_list)):
if (n == 1):
yield in_list[i]
else:
for sub_list in unique_combinations_funct(in_list[i+1:], n-1):
yield in_list[i] + sub_list
# ---------------------------------------------------------------------------
def compact(self):
"""Method to compact an index data structure.
See implementations in derived classes for details.
"""
logging.exception('Override abstract method in derived class')
raise Exception
# ---------------------------------------------------------------------------
def __dedup_rec_pairs__(self, rec_id_list, rec_pair_dict):
"""Create record pairs for a deduplication using the given record
identifier list and insert them into the given record pair dictionary.
This version does not modify the input record identifier list. It creates
a local copy of the record identifier list which is then sorted.
"""
rec_cnt = 1 # Counter for second record identifier
this_rec_id_list = rec_id_list[:]
this_rec_id_list.sort()
for rec_ident1 in this_rec_id_list:
rec_ident2_set = rec_pair_dict.get(rec_ident1, set())
for rec_ident2 in this_rec_id_list[rec_cnt:]:
assert rec_ident1 != rec_ident2
rec_ident2_set.add(rec_ident2)
rec_pair_dict[rec_ident1] = rec_ident2_set
rec_cnt += 1
del this_rec_id_list
# ---------------------------------------------------------------------------
def __link_rec_pairs__(self, rec_id_list1, rec_id_list2, rec_pair_dict):
"""Create record pairs for a linkage using the given two record identifier
lists and insert them into the given record pair dictionary.
"""
for rec_ident1 in rec_id_list1:
for rec_ident2 in rec_id_list2:
rec_ident2_set = rec_pair_dict.get(rec_ident1, set())
rec_ident2_set.add(rec_ident2)
rec_pair_dict[rec_ident1] = rec_ident2_set
# ---------------------------------------------------------------------------
def run(self):
"""Run the record pair comparison accoding to the index.
See implementations in derived classes for details.
"""
logging.exception('Override abstract method in derived class')
raise Exception
# ---------------------------------------------------------------------------
def __get_field_names_list__(self):
"""Returns the list of all field comparison descriptions, to be used when
a weight vector file is written.
"""
field_names_list = []
for field_comp_tuple in self.rec_comparator.field_comparator_list:
field_names_list.append(field_comp_tuple[0].description)
return field_names_list
# ---------------------------------------------------------------------------
def __compare_rec_pairs_from_dict__(self, length_filter_perc = None,
cut_off_threshold = None):
"""This method compares all the records pairs in the record pair dictionary
and puts the resulting weight vectors into a dictionary which is then
returned.
If the argument 'length_filter_perc' is set to a percentage value
between 0.0 and 100.0 the lengths of the concatenated record values in
the fields used for comparison will be calculated, and if the difference
is larger than the given percentage value the record pair will not be
compared. For example, if a record A has values with a total length of
12 characters and a record B has values with a total length of 16
characters, then their percentage difference will be:
|12 - 16| / max(12, 16) = 4 / 16 = 0.25 (25%)
So if the value of 'length_filter_perc' is set to 20 this record pair
will be filtered out and not compared.
Default value for 'length_filter_perc' is None, which means no length
filtering will be performed. For more information about filtering in the
record linkage process please refer to:
- Adaptive Filtering for Efficient Record Linkage
Lifang Gu and Rohan Baxter,
SIAM Data Mining conference, 2004.
The second argument 'cut_off_threshold' can be set to a numerical value,
in which case all compared record pairs with a summed weight vector
value less than this threshold will not be stored in the weight vector
dictionary. Default value for 'cut_off_threshold' is None, which means
all compared record pairs will be stored in the weight vector
dictionary.
"""
# Check if weight vector file should be written - - - - - - - - - - - - - -
#
if (self.weight_vec_file != None):
try:
weight_vec_fp = open(self.weight_vec_file, 'w')
except:
logging.exception('Cannot write weight vector file: %s' % \
(self.weight_vec_file))
raise Exception
weight_vec_writer = csv.writer(weight_vec_fp)
# Write header line with descriptions of field comparisons
#
weight_vec_header_line = ['rec_id1', 'rec_id2'] + \
self.__get_field_names_list__()
weight_vec_writer.writerow(weight_vec_header_line)
# Calculate a counter for the progress report - - - - - - - - - - - - - - -
#
if (self.progress_report != None):
progress_report_cnt = max(1, int(self.num_rec_pairs / \
(100.0 / self.progress_report)))
else: # So no progress report is being logged
progress_report_cnt = self.num_rec_pairs + 1
weight_vec_dict = {} # Dictionary with calculated weight vectors
comp_done = 0 # Number of comparisons done
rec_cache1 = self.rec_cache1 # Shorthands to make program faster
rec_pair_dict = self.rec_pair_dict
rec_comp = self.rec_comparator.compare
rec_length_cache = self.rec_length_cache
# Check length filter and cut-off threshold arguments - - - - - - - - - - -
#
if (length_filter_perc != None):
auxiliary.check_is_percentage('Length filter percentage',
length_filter_perc)
logging.info(' Length filtering set to %.1f%%' % (length_filter_perc))
length_filter_perc /= 100.0 # Normalise
if (cut_off_threshold != None):
auxiliary.check_is_number('Cut-off threshold', cut_off_threshold)
logging.info(' Cut-off threshold set to: %.2f' % (cut_off_threshold))
num_rec_pairs_filtered = 0 # Count number of removed record pairs
num_rec_pairs_below_thres = 0
# Set shorthand depending upon deduplication or linkage - - - - - - - - - -
#
if (self.do_deduplication == True): # A deduplication run
rec_cache2 = self.rec_cache1
else:
rec_cache2 = self.rec_cache2
start_time = time.time()
for rec_ident1 in rec_pair_dict:
rec1 = rec_cache1[rec_ident1] # Get the actual first record
if (length_filter_perc != None):
rec1_len = len(''.join(rec1)) # Get length in characters for record
for rec_ident2 in rec_pair_dict[rec_ident1]:
rec2 = rec_cache2[rec_ident2] # Get actual second record
do_comp = True # Flag, specify if comparison should be done
if (length_filter_perc != None):
if (rec_ident2 in rec_length_cache): # Length is cached
rec2_len = rec_length_cache[rec_ident2]
else:
rec2_len = len(''.join(rec2))
rec_length_cache[rec_ident2] = rec2_len
perc_diff = float(abs(rec1_len - rec2_len)) / max(rec1_len, rec2_len)
if (perc_diff > length_filter_perc):
do_comp = False # Difference too large, don't do comparison
num_rec_pairs_filtered += 1
if (do_comp == True):
w_vec = rec_comp(rec1, rec2) # Compare them
if (cut_off_threshold == None) or (sum(w_vec) >= cut_off_threshold):
# Put result into weight vector dictionary
#
if (self.weight_vec_file == None):
weight_vec_dict[(rec_ident1, rec_ident2)] = w_vec
else:
weight_vec_writer.writerow([rec_ident1, rec_ident2]+w_vec)
else:
num_rec_pairs_below_thres += 1
comp_done += 1 # Count all record pair comparisons (even if not done)
if ((comp_done % progress_report_cnt) == 0):
self.__log_comparison_progress__(comp_done, start_time)
used_sec_str = auxiliary.time_string(time.time()-start_time)
rec_time_str = auxiliary.time_string((time.time()-start_time) / \
self.num_rec_pairs)
logging.info('Compared %d record pairs in %s (%s per pair)' % \
(self.num_rec_pairs, used_sec_str,rec_time_str))
if (length_filter_perc != None):
logging.info(' Length filtering (set to %.1f%%) filtered %d record ' % \
(length_filter_perc*100, num_rec_pairs_filtered) + 'pairs')
if (cut_off_threshold != None):
logging.info(' %d record pairs had summed weights below threshold ' % \
(num_rec_pairs_below_thres) + '%.2f' % (cut_off_threshold))
memory_usage_str = auxiliary.get_memory_usage()
if (memory_usage_str != None):
logging.info(' '+memory_usage_str)
if (self.weight_vec_file == None):
return [self.__get_field_names_list__(), weight_vec_dict]
else:
weight_vec_fp.close()
return None
# ---------------------------------------------------------------------------
def __find_closest__(self, sorted_list, elem):
"""Binary search of the given element 'elem' in the given sorted list, and
return index of exact match or closest match (before where the element
would be).
Used in BigMatch and Dedup indexing methods.
"""
start = 0
end = len(sorted_list)-1
while 1:
if (end < start):
return end # Not found
middle = (start + end) / 2
middle_val = sorted_list[middle] # Get the element from the middle
if (middle_val == elem): # Found the key value
return middle
elif (elem < middle_val):
end = middle - 1
else:
start = middle + 1
# ---------------------------------------------------------------------------
def load(self, index_file_name):
"""Load a previously saved index from a binary file.
"""
pass # To be done