-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEW_1851_config.yaml
86 lines (78 loc) · 2.52 KB
/
EW_1851_config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Census input, cleaning, lookup and matching settings for this run.
# NOTE(review): the nesting below was reconstructed from a copy in which every
# key sat at column 0, so `sep`, `encoding`, `quoting`, `usecols` and `index`
# each collided at one level. Confirm the grouping against the code that
# consumes this file.
census:
  country: "EW"
  year: 1851
  uid_field: "RecID"
  field_to_geocode: "Address"
  boundaries_field: ["ConParID", "CEN_1851"]
  subset_field: "subset_id"
  census_file: "../data/input/census/1851_ew_geocode.txt"

  read_csv_params:
    sep: "\t"  # passed to 'sep' parameter of pandas read_csv
    encoding: "latin-1"  # passed to 'encoding' parameter of pandas read_csv
    quoting: 3  # must be int 0, 1, 2 or 3; passed to 'quoting' parameter of pandas read_csv
    na_values: [".", " ", " - "]  # passed to 'na_values' parameter of pandas read_csv
    # nrows: 1000000
    usecols: ["RecID", "Address", "ParID", "subset_id"]

  field_to_clean: "Address"
  # regex replacement file
  standardisation_file: "../configuration/standardisation_files/icem_street_standardisation.json"
  min_len: 5
  cleaned_field_suffix: "_alt"
  unique_field_to_geocode_name: "address_uid"

  # The lookup's usecols list the two columns named in boundaries_field above.
  lkups:
    conpar:
      lkup_file: "../data/input/ew/parish_dicts_encoding/1851_ICeM_DICTIONARY_CODED_conparidadded.txt"
      lkup_uid_field: "ParID"
      lkup_census_field: "ParID"
      lkup_params:
        encoding: "utf-8"
        sep: "\t"
        quoting: 3
        usecols: ["ParID", "CEN_1851", "ConParID"]

  write_processed_csv_params:
    sep: "\t"
    encoding: "utf-8"
    index: false

  write_processed_csv_params_slim:
    columns: ["address_uid", "Address_alt", "ConParID", "CEN_1851", "subset_id"]
    sep: "\t"
    encoding: "utf-8"
    index: false

  # NOTE(review): presumably maps comparison method -> output column label;
  # verify against the caller.
  comparers:
    rapidfuzzy_wratio: "rapidfuzzy_wratio_s"
    rapidfuzzy_partial_ratio_alignment: "align"

  sim_comp_thresh: 0.9
  align_thresh: 7
  final_score_field: "fs"
# Boundary (GIS) datasets, one entry per geometry layer.
# NOTE(review): the nesting below was reconstructed from a copy in which every
# key sat at column 0, so `geom_name`, `gis_file`, `gis_uid_field`,
# `gis_read_params`, `gis_write_params`, `sep`, `encoding` and `index` each
# collided at one level. Confirm the grouping against the code that consumes
# this file.
boundaries:
  boundary_1:
    geom_name: "parish"
    gis_file: "../data/input/ew/1851EngWalesParishandPlace/1851EngWalesParishandPlace_valid.shp"
    gis_uid_field: "ID"
    gis_read_params:
      engine: "pyogrio"
      columns: ["ID"]
      # max_features: 10000
    # Spreadsheet lookup; only boundary_1 defines lkup_* keys.
    lkup_file: "../data/input/ew/icem_parish_lkup/UKDS_GIS_to_icem.xlsx"
    lkup_field_uid: "UKDS_ID"
    lkup_field_censuslink: "conparid_51-91"
    lkup_read_params:
      sheet_name: "link"
      na_values: "."
      usecols: ["UKDS_ID", "conparid_51-91"]
    gis_write_params:
      sep: "\t"
      encoding: "utf-8"
      index: false

  boundary_2:
    geom_name: "rsd"
    gis_file: "../data/input/ew/rsd_boundary_data/RSD_1851_1911_JR_valid.shp"
    gis_uid_field: "CEN_1851"
    gis_read_params:
      engine: "pyogrio"
      columns: ["CEN_1851"]
      # max_features: 10000
      # geom_format: None
    gis_write_params:
      sep: "\t"
      encoding: "utf-8"
      index: false