-
Notifications
You must be signed in to change notification settings - Fork 5
/
config_cd.txt
executable file
·47 lines (30 loc) · 1.69 KB
/
config_cd.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
##############################################################
# Config file for Cd dataset
# Any line containing `#' will be treated as a comment.
# The Typical Format is Variablename = Value
###############################################################
# Choose K and L wisely: try higher values of K first and work down. Typically in the range 1-5 for light pruning;
# for more rigorous pruning use larger values of K.
K=1
# A larger L increases recall but also reports more pairs. Less sensitive than K.
L=4
# ngrams length
shingles=4
# Threshold: a pair is only reported if it is found in at least this many buckets (cancels random noise). If you are missing pairs, decrease this.
Thresh=2
# Path to the input CSV file. The first line is ignored (assumed to be the header). Every other line is treated as a record.
# The line number of a record is its ID; that is, the first line after the header is the record with ID 1, and so on.
Input=data/cd.csv
# Output file: each line will contain a pair of record IDs indicating a possible match.
Output=cd_pair.csv
##############################################################################
# These are advanced parameters that depend on available memory.
##############################################################################
# Number of cells in each bucket. Decrease if the program runs out of memory.
BucketSize=8
# The number of buckets in each table is 2^{this number}. Too small a value will never finish. Decrease if the program runs out of memory. Larger is better. Must be < 27.
RangePow=23
# Increase if MinHashing takes a lot of time. Must be a power of 2.
MinHashChunkSize=32
# Processes this many records in parallel; larger is faster. Decrease if the program runs out of memory.
Chunk=500000