Skip to content

Commit

Permalink
Merge pull request #1237 from SEED-platform/v2-release-hotfix
Browse files Browse the repository at this point in the history
Release hotfix (matching integrity)
  • Loading branch information
axelstudios authored Feb 3, 2017
2 parents 3daf0f8 + 41923d5 commit 2ceea20
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 39 deletions.
126 changes: 89 additions & 37 deletions seed/data_importer/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from celery import chord
from celery import shared_task
from celery.utils.log import get_task_logger
from django.db import IntegrityError
from django.db.models import Q
from unidecode import unidecode

Expand Down Expand Up @@ -989,10 +990,28 @@ def union_lol(lol):
return (canonical_states, noncanonical_states)


# NJACHECK - Check function for accuracy
class EquivalencePartitioner(object):
""" TODO: Document purpose of class. Move this class into its own location.
TODO: Fix lowerCamelCase methods"""

def __init__(self, equivalence_class_description, identity_fields):
    """Build the key-calculation functions used by this partitioner.

    :param equivalence_class_description: description handed unchanged to
        both makeResolvedKeyCalculationFunction (comparison keys) and
        makeCanonicalKeyCalculationFunction (canonical keys).
    :param identity_fields: iterable of field names. Each name is wrapped
        in a 1-tuple and the resulting list is handed to
        makeCanonicalKeyCalculationFunction to build the identity key
        function.
    """
    # If identity fields are not equal between two classes then we
    # force the classes to not be equivalent.
    description = equivalence_class_description

    self.equiv_comparison_key_func = self.makeResolvedKeyCalculationFunction(description)
    self.equiv_canonical_key_func = self.makeCanonicalKeyCalculationFunction(description)

    # One single-field group per identity field.
    identity_description = [(field_name,) for field_name in identity_fields]
    self.identity_key_func = self.makeCanonicalKeyCalculationFunction(identity_description)

@classmethod
def makeDefaultStateEquivalence(kls, equivalence_type):
def make_default_state_equivalence(kls, equivalence_type):
if equivalence_type == PropertyState:
return kls.makePropertyStateEquivalence()
elif equivalence_type == TaxLotState:
Expand Down Expand Up @@ -1046,45 +1065,54 @@ def calculate_key_equivalence(key1, key2):

@classmethod
def makePropertyStateEquivalence(kls):
property_equivalence_fields = [("pm_property_id", "custom_id_1"),
("custom_id_1",),
("normalized_address",)]
return kls(property_equivalence_fields)
property_equivalence_fields = [
("pm_property_id", "custom_id_1"),
("custom_id_1",),
("normalized_address",)
]
property_noequivalence_fields = ["pm_property_id"]

return kls(property_equivalence_fields, property_noequivalence_fields)

@classmethod
def makeTaxLotStateEquivalence(kls):
tax_lot_equivalence_fields = [("jurisdiction_tax_lot_id", "custom_id_1"),
("custom_id_1",),
("normalized_address",)]
return kls(tax_lot_equivalence_fields)

def __init__(self, equivalence_class_description):
# self.equiv_compare_func = self.makeKeyEquivalenceFunction(equivalence_class_description)
self.equiv_comparison_key_func = self.makeResolvedKeyCalculationFunction(
equivalence_class_description)
self.equiv_canonical_key_func = self.makeCanonicalKeyCalculationFunction(
equivalence_class_description)
return
tax_lot_equivalence_fields = [
("jurisdiction_tax_lot_id", "custom_id_1"),
("custom_id_1",),
("normalized_address",)
]
tax_lot_noequivalence_fields = ["jurisdiction_tax_lot_id"]
return kls(tax_lot_equivalence_fields, tax_lot_noequivalence_fields)

def calculate_comparison_key(self, obj):
    """Return the comparison key for ``obj``.

    Delegates to the ``equiv_comparison_key_func`` stored on the instance.
    """
    comparison_key_func = self.equiv_comparison_key_func
    return comparison_key_func(obj)

def calculate_canonical_key(self, obj):
    """Return the canonical key for ``obj``.

    Delegates to the ``equiv_canonical_key_func`` stored on the instance.
    """
    canonical_key_func = self.equiv_canonical_key_func
    return canonical_key_func(obj)

# def calculate_object_equivalence(self, key, obj):
# return self.equiv_compare_func(key, obj)
def calculate_identity_key(self, obj):
    """Return the identity key for ``obj``.

    Delegates to the ``identity_key_func`` stored on the instance.
    """
    identity_key_func = self.identity_key_func
    return identity_key_func(obj)

def key_needs_merging(self, original_key, new_key):
    """Return True when ``new_key`` can fill an empty slot of ``original_key``.

    The keys are compared position by position; merging is needed as soon
    as one position is falsy in ``original_key`` and truthy in ``new_key``.

    :param original_key: sequence of key components (falsy means "missing").
    :param new_key: sequence of key components for the incoming object.
    :return: bool
    """
    # ``not a and b`` evaluates to ``b`` itself (e.g. an id string), not to
    # True, so a ``True in [...]`` membership test misses nearly every real
    # value. any() tests truthiness instead.
    return any(not a and b for (a, b) in zip(original_key, new_key))

def merge_keys(self, key1, key2):
    """Combine two keys, preferring ``key1`` and falling back to ``key2``.

    :param key1: sequence of key components; truthy components win.
    :param key2: sequence of key components used where ``key1`` is falsy.
    :return: tuple with one component per zipped position.
    """
    # A tuple (not a list) is returned because the merged key is used as a
    # dictionary key by calculate_equivalence_classes and so must be hashable.
    return tuple(a if a else b for (a, b) in zip(key1, key2))

def identities_are_different(self, key1, key2):
    """Report whether two identity keys conflict.

    Positions where either side is None are ignored; the keys are
    different as soon as one remaining position holds unequal values.

    :param key1: sequence of identity values (None means "unknown").
    :param key2: sequence of identity values (None means "unknown").
    :return: True if any position with both values present differs,
        otherwise False.
    """
    return any(
        left != right
        for (left, right) in zip(key1, key2)
        if left is not None and right is not None
    )

def calculate_equivalence_classes(self, list_of_obj):
# TODO: Finish writing the equivalence class code.

equivalence_classes = collections.defaultdict(list)
identities_for_equivalence = {}

# There is some subtlety with whether we use "comparison" keys
# or "canonical" keys. This reflects the difference between
Expand All @@ -1098,17 +1126,24 @@ def calculate_equivalence_classes(self, list_of_obj):
for (ndx, obj) in enumerate(list_of_obj):
cmp_key = self.calculate_comparison_key(obj)
can_key = self.calculate_canonical_key(obj)
identity_key = self.calculate_identity_key(obj)

for class_key in equivalence_classes:
if self.calculate_key_equivalence(class_key, cmp_key):
if self.calculate_key_equivalence(class_key,
cmp_key) and not self.identities_are_different(
identities_for_equivalence[class_key], identity_key):

# Must check the identities to make sure all is a-ok.
equivalence_classes[class_key].append(ndx)

if self.key_needs_merging(class_key, cmp_key):
merged_key = self.merge_keys(class_key, cmp_key)
equivalence_classes[merged_key] = equivalence_classes.pop(class_key)
identities_for_equivalence[merged_key] = identity_key
break
else:
equivalence_classes[can_key].append(ndx)
identities_for_equivalence[can_key] = identity_key
return equivalence_classes # TODO: Make sure return is correct on this.


Expand All @@ -1117,9 +1152,24 @@ def match_and_merge_unmatched_objects(unmatched_states, partitioner, org, import
unmatched_tax_lot_states and returns a set of states that
correspond to unmatched states."""

_log.debug("Starting to map_and_merge_unmatched_objects")

# Sort unmatched states by primary key. This shouldn't be happening!
unmatched_states.sort(key=lambda state: state.pk)

# current_match_cycle = import_file.cycle
# current_match_cycle = Cycle.objects.filter(organization = org).order_by('-start').first()

def getattrdef(obj, attr, default):
    """Return attribute ``attr`` of ``obj``, or ``default`` if it is missing.

    :param obj: object to read from.
    :param attr: attribute name (string).
    :param default: value returned when ``obj`` has no such attribute.
    """
    # The three-argument getattr performs the lookup once, instead of a
    # hasattr() probe followed by a second getattr() evaluation.
    return getattr(obj, attr, default)

keyfunction = lambda ndx: (getattrdef(unmatched_states[ndx], "release_date", None),
getattrdef(unmatched_states[ndx], "generation_date", None),
getattrdef(unmatched_states[ndx], "pk", None))

# This removes any states that are duplicates,
equivalence_classes = partitioner.calculate_equivalence_classes(unmatched_states)

Expand All @@ -1128,6 +1178,8 @@ def match_and_merge_unmatched_objects(unmatched_states, partitioner, org, import
merged_objects = []

for (class_key, class_ndxs) in equivalence_classes.items():
class_ndxs.sort(key=keyfunction)

if len(class_ndxs) == 1:
merged_objects.append(unmatched_states[class_ndxs[0]])
continue
Expand All @@ -1147,6 +1199,7 @@ def match_and_merge_unmatched_objects(unmatched_states, partitioner, org, import
else:
merged_objects.append(merged_result)

_log.debug("DONE with map_and_merge_unmatched_objects")
return merged_objects, equivalence_classes.keys()


Expand All @@ -1168,7 +1221,7 @@ def merge_unmatched_into_views(unmatched_states, partitioner, org, import_file):
ParentAttrName = "property"
elif isinstance(unmatched_states[0], TaxLotState):
ObjectViewClass = TaxLotView
ParentAttrName = "tax_lot"
ParentAttrName = "taxlot"
else:
raise ValueError("Unknown class '{}' passed to merge_unmatched_into_views".format(
type(unmatched_states[0])))
Expand Down Expand Up @@ -1208,12 +1261,17 @@ def merge_unmatched_into_views(unmatched_states, partitioner, org, import_file):
else:
# Grab another view that has the same parent as
# the one we belong to.
cousin_view = existing_view_states[key].values()[0].values()[0][0]
cousin_view = existing_view_states[key].values()[0]
view_parent = getattr(cousin_view, ParentAttrName)
new_view = type(cousin_view)()
setattr(new_view, ParentAttrName, view_parent)
new_view.save()
matched_views.append(new_view)
new_view.cycle = current_match_cycle
new_view.state = unmatched
try:
new_view.save()
matched_views.append(new_view)
except IntegrityError:
_log.warn("Unable to save the new view as it already exists in the db")

break
else:
Expand Down Expand Up @@ -1244,7 +1302,7 @@ def _match_properties_and_taxlots(file_pk, user_pk):
unmatched_properties, duplicate_property_states = filter_duplicated_states(
all_unmatched_properties)

property_partitioner = EquivalencePartitioner.makeDefaultStateEquivalence(PropertyState)
property_partitioner = EquivalencePartitioner.make_default_state_equivalence(PropertyState)

# Merge everything together based on the notion of equivalence
# provided by the partitioner.
Expand All @@ -1263,17 +1321,15 @@ def _match_properties_and_taxlots(file_pk, user_pk):

# Do the same process with the TaxLots.
all_unmatched_tax_lots = import_file.find_unmatched_tax_lot_states()

if all_unmatched_tax_lots:
unmatched_tax_lots, duplicate_tax_lot_states = filter_duplicated_states(
all_unmatched_tax_lots)

taxlot_partitioner = EquivalencePartitioner.makeDefaultStateEquivalence(TaxLotState)

taxlot_partitioner = EquivalencePartitioner.make_default_state_equivalence(TaxLotState)
unmatched_tax_lots, taxlot_equivalence_keys = match_and_merge_unmatched_objects(
unmatched_tax_lots,
taxlot_partitioner, org,
import_file)

merged_taxlot_views = merge_unmatched_into_views(unmatched_tax_lots, taxlot_partitioner,
org, import_file)
else:
Expand Down Expand Up @@ -1472,6 +1528,7 @@ def save_state_match(state1, state2, confidence=None, user=None,
assert AuditLogClass.objects.filter(state=state1).count() >= 1
assert AuditLogClass.objects.filter(state=state2).count() >= 1

# NJACHECK - is this logic correct?
state_1_audit_log = AuditLogClass.objects.filter(state=state1).first()
state_2_audit_log = AuditLogClass.objects.filter(state=state2).first()

Expand Down Expand Up @@ -1534,19 +1591,14 @@ def pair_new_states(merged_property_views, merged_taxlot_views):
global taxlot_m2m_keygen
global property_m2m_keygen

taxlot_m2m_keygen = EquivalencePartitioner(tax_cmp_fmt)
property_m2m_keygen = EquivalencePartitioner(prop_cmp_fmt)
taxlot_m2m_keygen = EquivalencePartitioner(tax_cmp_fmt, ["jurisdiction_tax_lot_id"])
property_m2m_keygen = EquivalencePartitioner(prop_cmp_fmt, ["pm_property_id", "jurisdiction_property_id"])

import time
st = time.time()
property_views = PropertyView.objects.filter(state__organization=org, cycle=cycle).values_list(
*prop_comparison_field_names)
taxlot_views = TaxLotView.objects.filter(state__organization=org, cycle=cycle).values_list(
*tax_comparison_field_names)

et = time.time()
print "{} seconds.".format(et - st)

# For each of the view objects, make an
prop_type = namedtuple("Prop", prop_comparison_fields)
taxlot_type = namedtuple("TL", tax_comparison_fields)
Expand Down Expand Up @@ -1595,7 +1647,7 @@ def pair_new_states(merged_property_views, merged_taxlot_views):
# pdb.set_trace()

pv_key = property_m2m_keygen.calculate_comparison_key(pv.state)
# TODO: Refactor pronto. This iterating over the tax lot is totally bogus and I hate it.
# TODO: Refactor pronto. Iterating over the tax lots is a bad implementation.
for tlk in taxlot_keys:
if pv_key[0] and ";" in pv_key[0]:
for lotnum in map(lambda x: x.strip(), pv_key[0].split(";")):
Expand Down
5 changes: 3 additions & 2 deletions seed/data_importer/tests/test_demo_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,9 @@ def test_demo_v2(self):
organization=self.org,
import_file=self.import_file_property,
)
# There will be 4 properties that will still be in the DATA_STATE_MAPPING
self.assertEqual(len(ps), 4)

# there shouldn't be any properties left in the mapping state
self.assertEqual(len(ps), 0)

# psv = PropertyView.objects.filter(state__organization=self.org)
# self.assertEqual(len(psv), 12)
Expand Down

0 comments on commit 2ceea20

Please sign in to comment.