diff --git a/bulk/management/commands/export_history.py b/bulk/management/commands/export_history.py new file mode 100644 index 00000000..1e0aa723 --- /dev/null +++ b/bulk/management/commands/export_history.py @@ -0,0 +1,204 @@ +import json +from functools import lru_cache +from collections import defaultdict +from django.core.management.base import BaseCommand +from opencivicdata.legislative.models import Bill, VoteEvent, LegislativeSession +from history.models import Change + + +def format_time( + event_time__year, + event_time__month, + event_time__day, + event_time__hour=None, + event_time__minute=None, +): + timestr = f"{event_time__year}-{event_time__month:02d}-{event_time__day:02d}" + if event_time__hour is not None: + timestr += f"T{event_time__hour:02d}" + if event_time__minute is not None: + timestr += f":{event_time__minute:02d}" + return timestr + + +@lru_cache() +def get_item_properties(obj_id=None, session_id=None): + if session_id: + session = LegislativeSession.objects.get(pk=session_id) + elif obj_id.startswith("ocd-bill"): + session = Bill.objects.get(pk=obj_id).legislative_session + elif obj_id.startswith("ocd-vote"): + session = VoteEvent.objects.get(pk=obj_id).legislative_session + return { + "jurisdiction_id": session.jurisdiction_id, + "session_id": session.identifier, + } + + +def accumulate_changes(changes): + """ + roll up all changes in a list to be attached to the appropriate ID + """ + # for each type, we return a dict mapping ids to change objects + accum = { + "bill": defaultdict(list), + "vote": defaultdict(list), + "related_entity": defaultdict(list), + "version_link": defaultdict(list), + "document_link": defaultdict(list), + } + + for c in changes: + if c.object_id.startswith("ocd-bill"): + ctype = "bill" + elif c.object_id.startswith("ocd-vote"): + ctype = "vote" + elif c.table_name == "opencivicdata_billactionrelatedentity": + ctype = "related_entity" + elif c.table_name == "opencivicdata_billversionlink": + ctype = "version_link" + elif c.table_name == "opencivicdata_billdocumentlink": + ctype = "document_link" + else: + raise ValueError("unexpected id: " + c.object_id) + accum[ctype][c.object_id].append(c) + return accum + + +MAPPING = { + "opencivicdata_billaction": "actions", + "opencivicdata_billversion": "versions", + "opencivicdata_billdocument": "documents", + "opencivicdata_billsponsorship": "sponsors", + "opencivicdata_billsource": "sources", + "opencivicdata_billabstract": "abstracts", + "opencivicdata_personvote": "votes", + "opencivicdata_votecount": "counts", + "opencivicdata_votesource": "sources", +} + + +def update_old(old, change): + """ like dict.update, but keeps first value it sees """ + for k, v in change.items(): + if k not in old: + old[k] = v + + +def clean_subobj(subobj): + """ we don't want to show these since we're just nesting the objects """ + subobj.pop("id") + subobj.pop("bill_id", None) + subobj.pop("vote_id", None) + return subobj + + +def make_change_object(changes): + """ + changes is a list of changes that are for the same object + + return a single object representing all changes as one + """ + # start with the the parent object id + item_id = changes[0].object_id + item_type = "bill" if item_id.startswith("ocd-bill") else "vote" + # unless we see a top-level create or delete, this is an update + change_type = "update" + old_obj = {} + new_obj = {} + + for change in changes: + if ( + change.table_name == "opencivicdata_bill" + or change.table_name == "opencivicdata_voteevent" + ): + if change.change_type == "update": + update_old(old_obj, change.old) + new_obj.update(change.new) + elif change.change_type == "create": + new_obj.update(change.new) + change_type = "create" + elif change.change_type == "delete": + change_type = "delete" + old_obj = change.old + else: + raise ValueError(change.change_type) + else: + # standard subfield handling + field_name = MAPPING[change.table_name] + + if field_name not in new_obj: + new_obj[field_name] = [] + if field_name not in old_obj: + old_obj[field_name] = [] + + # subfields are either deleted or created, updates don't currently happen via pupa + if change.change_type == "delete": + old_obj[field_name].append(clean_subobj(change.old)) + elif change.change_type == "create": + new_obj[field_name].append(clean_subobj(change.new)) + else: + print(change.object_id, change.change_type, field_name, change.new) + raise ValueError("update unexpected") + + if change_type != "delete": + item_properties = get_item_properties(item_id) + else: + item_properties = get_item_properties( + session_id=old_obj["legislative_session_id"] + ) + return { + "item_type": item_type, + "item_id": item_id, + "item_properties": item_properties, + "action": change_type, + "old": old_obj if change_type != "create" else None, + "new": new_obj if change_type != "delete" else None, + } + + +def handle_epoch(**kwargs): + """ + get all changes for a time period and return list of change_objects + """ + changes = list(Change.objects.filter(**kwargs)) + formatted = format_time(**kwargs) + print(f"{len(changes)} changes for {formatted}") + changes = accumulate_changes(changes) + print(f"{len(changes['bill'])} bills, {len(changes['vote'])} votes") + + # TODO: handle subobjects + for bill_id, obj_changes in changes["bill"].items(): + change_obj = make_change_object(obj_changes) + yield change_obj + for vote_id, obj_changes in changes["vote"].items(): + change_obj = make_change_object(obj_changes) + yield change_obj + + +def output_json(epoch, data): + output = { + "version": "0.1", + "source": "https://openstates.org", + "epoch": epoch, + "changes": [], + } + for item in data: + output["changes"].append(item) + + with open(f"changelog_{epoch}.json", "w") as f: + json.dump(output, f, indent=1) + + +class Command(BaseCommand): + help = "export history data" + + def handle(self, *args, **options): + for time in Change.objects.values( + "event_time__year", + "event_time__month", + "event_time__day", + "event_time__hour", + ).distinct(): + formatted = format_time(**time) + output_json(formatted, handle_epoch(**time)) diff --git a/docker/init-db.sh b/docker/init-db.sh index 635b5785..1fd043d1 100755 --- a/docker/init-db.sh +++ b/docker/init-db.sh @@ -15,7 +15,7 @@ FILE=data.pgdump if [ ! -f "$FILE" ]; then wget https://data.openstates.org/postgres/monthly/$WHICH-public.pgdump -O $FILE fi -PGPASSWORD=openstates pg_restore --host db --user openstates -d openstatesorg $FILE; +PGPASSWORD=openstates pg_restore --disable-triggers --host db --user openstates -d openstatesorg $FILE; # rm $FILE; poetry run ./manage.py update_materialized_views --initial diff --git a/history/__init__.py b/history/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/history/management/__init__.py b/history/management/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/history/management/commands/__init__.py b/history/management/commands/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/history/management/commands/history_install.py b/history/management/commands/history_install.py new file mode 100644 index 00000000..ef4e8790 --- /dev/null +++ b/history/management/commands/history_install.py @@ -0,0 +1,81 @@ +from django.conf import settings +from django.core.management.base import BaseCommand +from django.db import connection +from collections import namedtuple + + +GET_TRIGGERS_SQL = """ +select event_object_schema as table_schema, + event_object_table as table_name, + trigger_schema, + trigger_name, + string_agg(event_manipulation, ',') as event, + action_timing as activation, + action_condition as condition, + action_statement as definition +from information_schema.triggers +group by 1,2,3,4,6,7,8 +order by table_schema, table_name; +""" + +CREATE_TRIGGER_TEMPLATE_SQL = """ +CREATE TRIGGER history_insert AFTER INSERT ON {0} + FOR EACH ROW EXECUTE PROCEDURE history_insert(); +CREATE TRIGGER history_delete AFTER DELETE ON {0} + FOR EACH ROW EXECUTE PROCEDURE history_delete(); +CREATE TRIGGER history_update AFTER UPDATE ON {0} + FOR EACH ROW EXECUTE PROCEDURE history_update(); +""" + + +class Command(BaseCommand): + help = "install required triggers for history tracking" + + def add_arguments(self, parser): + parser.add_argument("--dry-run", action="store_true") + parser.add_argument("--uninstall", action="store_true") + + def _exec_sql(self, statement): + with connection.cursor() as cursor: + if self.dry_run: + print(statement) + else: + cursor.execute(statement) + + def uninstall(self): + drop_sql = "" + count = 0 + tables = set() + with connection.cursor() as cursor: + cursor.execute(GET_TRIGGERS_SQL) + desc = cursor.description + nt_result = namedtuple("Result", [col[0] for col in desc]) + for row in cursor.fetchall(): + result = nt_result(*row) + if result.trigger_name in ( + "history_insert", + "history_update", + "history_delete", + ): + drop_sql += ( + f"DROP TRIGGER {result.trigger_name} ON {result.table_name};\n" + ) + count += 1 + tables.add(result.table_name) + + print( + f"Found {count} existing triggers installed on {len(tables)} tables, dropping them." + ) + self._exec_sql(drop_sql) + + def handle(self, *args, **options): + self.dry_run = options["dry_run"] + if self.dry_run: + print("Dry Run: SQL will be printed but database will not be modified.") + if options["uninstall"]: + self.uninstall() + for table in settings.HISTORY_TABLES: + print(f"Installing triggers on {table}.") + self._exec_sql(CREATE_TRIGGER_TEMPLATE_SQL.format(table, table, table)) + print("Installing get_object_id function from settings.") + self._exec_sql(settings.HISTORY_GET_OBJECT_ID_SQL) diff --git a/history/migrations/0001_initial.py b/history/migrations/0001_initial.py new file mode 100644 index 00000000..00b3061b --- /dev/null +++ b/history/migrations/0001_initial.py @@ -0,0 +1,33 @@ +# Generated by Django 2.2.3 on 2020-02-10 20:01 + +import django.contrib.postgres.fields.jsonb +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [] + + operations = [ + migrations.CreateModel( + name="Change", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("event_time", models.DateTimeField(editable=False)), + ("table_name", models.CharField(max_length=100)), + ("object_id", models.CharField(max_length=45)), + ("old", django.contrib.postgres.fields.jsonb.JSONField(null=True)), + ("new", django.contrib.postgres.fields.jsonb.JSONField(null=True)), + ], + ) + ] diff --git a/history/migrations/0002_jsonb_history.py b/history/migrations/0002_jsonb_history.py new file mode 100644 index 00000000..63fe3452 --- /dev/null +++ b/history/migrations/0002_jsonb_history.py @@ -0,0 +1,114 @@ +from django.db import migrations, models + +""" +These SQL functions are derived from code and posts written by Glyn Astill + +http://8kb.co.uk/blog/2015/01/19/copying-pavel-stehules-simple-history-table-but-with-the-jsonb-type/ +""" + +JSON_DELETE_OP = """ +-- +-- Glyn Astill 16/01/2015 +-- Attempt at hstore style delete operator for jsonb +-- + +SET search_path = 'public'; + +CREATE OR REPLACE FUNCTION jsonb_delete_left(a jsonb, b text) +RETURNS jsonb AS +$BODY$ + SELECT COALESCE( + ( + SELECT ('{' || string_agg(to_json(key) || ':' || value, ',') || '}') + FROM jsonb_each(a) + WHERE key <> b + ) + , '{}')::jsonb; +$BODY$ +LANGUAGE sql IMMUTABLE STRICT; +COMMENT ON FUNCTION jsonb_delete_left(jsonb, text) IS 'delete key in second argument from first argument'; + +CREATE OPERATOR - ( PROCEDURE = jsonb_delete_left, LEFTARG = jsonb, RIGHTARG = text); +COMMENT ON OPERATOR - (jsonb, text) IS 'delete key from left operand'; + +-- + +CREATE OR REPLACE FUNCTION jsonb_delete_left(a jsonb, b text[]) +RETURNS jsonb AS +$BODY$ + SELECT COALESCE( + ( + SELECT ('{' || string_agg(to_json(key) || ':' || value, ',') || '}') + FROM jsonb_each(a) + WHERE key <> ALL(b) + ) + , '{}')::jsonb; +$BODY$ +LANGUAGE sql IMMUTABLE STRICT; +COMMENT ON FUNCTION jsonb_delete_left(jsonb, text[]) IS 'delete keys in second argument from first argument'; + +CREATE OPERATOR - ( PROCEDURE = jsonb_delete_left, LEFTARG = jsonb, RIGHTARG = text[]); +COMMENT ON OPERATOR - (jsonb, text[]) IS 'delete keys from left operand'; + +-- + +CREATE OR REPLACE FUNCTION jsonb_delete_left(a jsonb, b jsonb) +RETURNS jsonb AS +$BODY$ + SELECT COALESCE( + ( + SELECT ('{' || string_agg(to_json(key) || ':' || value, ',') || '}') + FROM jsonb_each(a) + WHERE NOT ('{' || to_json(key) || ':' || value || '}')::jsonb <@ b + ) + , '{}')::jsonb; +$BODY$ +LANGUAGE sql IMMUTABLE STRICT; +COMMENT ON FUNCTION jsonb_delete_left(jsonb, jsonb) IS 'delete matching pairs in second argument from first argument'; + +CREATE OPERATOR - ( PROCEDURE = jsonb_delete_left, LEFTARG = jsonb, RIGHTARG = jsonb); +COMMENT ON OPERATOR - (jsonb, jsonb) IS 'delete matching pairs from left operand'; +""" + +TRIGGER_SQL = """ +CREATE OR REPLACE FUNCTION history_insert() +RETURNS TRIGGER AS $$ +DECLARE + object_id varchar(100) := get_object_id(TG_TABLE_NAME, NEW); +BEGIN + INSERT INTO history_change(event_time, table_name, object_id, new) + VALUES(CURRENT_TIMESTAMP, TG_TABLE_NAME, object_id, row_to_json(NEW)::jsonb); + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION history_delete() +RETURNS TRIGGER AS $$ +DECLARE + object_id varchar(100) := get_object_id(TG_TABLE_NAME, OLD); +BEGIN + INSERT INTO history_change(event_time, table_name, object_id, old) + VALUES(CURRENT_TIMESTAMP, TG_TABLE_NAME, object_id, row_to_json(OLD)::jsonb); + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION history_update() +RETURNS TRIGGER AS $$ +DECLARE + js_new jsonb := row_to_json(NEW)::jsonb; + js_old jsonb := row_to_json(OLD)::jsonb; + object_id varchar(100) := get_object_id(TG_TABLE_NAME, OLD); +BEGIN + INSERT INTO history_change(event_time, table_name, object_id, old, new) + VALUES(CURRENT_TIMESTAMP, TG_TABLE_NAME, object_id, js_old - js_new, js_new - js_old); + RETURN NEW; +END; +$$ LANGUAGE plpgsql; +""" + + +class Migration(migrations.Migration): + dependencies = [("history", "0001_initial")] + + operations = [migrations.RunSQL(JSON_DELETE_OP), migrations.RunSQL(TRIGGER_SQL)] diff --git a/history/migrations/__init__.py b/history/migrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/history/models.py b/history/models.py new file mode 100644 index 00000000..fb0c23e9 --- /dev/null +++ b/history/models.py @@ -0,0 +1,22 @@ +from django.db import models +from django.contrib.postgres.fields import JSONField + + +class Change(models.Model): + event_time = models.DateTimeField(editable=False) + table_name = models.CharField(max_length=100) + object_id = models.CharField(max_length=45) + old = JSONField(null=True) + new = JSONField(null=True) + + @property + def change_type(self): + if self.old is None: + return "create" + elif self.new is None: + return "delete" + else: + return "update" + + class Meta: + ordering = ["event_time"] diff --git a/history/tests/__init__.py b/history/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/history/tests/test_bill_history.py b/history/tests/test_bill_history.py new file mode 100644 index 00000000..f3c25fc3 --- /dev/null +++ b/history/tests/test_bill_history.py @@ -0,0 +1,206 @@ +import pytest +from django.core.management import call_command +from opencivicdata.core.models import Jurisdiction, Division, Organization +from opencivicdata.legislative.models import Bill, LegislativeSession, VoteEvent +from ..models import Change + + +@pytest.mark.django_db +def setup(): + call_command("history_install") + + +@pytest.fixture +def session(): + d = Division.objects.create(id="ocd-division/country:us/state:ak", name="Alaska") + j = Jurisdiction.objects.create(name="Alaska", id="ak", division=d) + s = LegislativeSession.objects.create(identifier="2020", jurisdiction=j) + return s + + +@pytest.fixture +def org(): + j = Jurisdiction.objects.get() + return Organization.objects.create( + jurisdiction=j, name="House", classification="lower" + ) + + +@pytest.mark.django_db +def test_bill_insert(session): + b = Bill.objects.create( + title="Test bill.", identifier="SB 1", legislative_session=session + ) + assert Change.objects.count() == 1 + bh = Change.objects.get() + assert bh.object_id == b.id + assert bh.table_name == "opencivicdata_bill" + assert bh.old is None + assert bh.new["title"] == "Test bill." + assert bh.new["identifier"] == "SB 1" + + +@pytest.mark.django_db +def test_bill_update(session): + b = Bill.objects.create( + title="mistake", identifier="SB 1", legislative_session=session + ) + b.title = "corrected" + b.save() + + assert Change.objects.count() == 2 + update = Change.objects.order_by("event_time")[1] + assert set(update.old.keys()) == {"title", "updated_at"} + assert update.old["title"] == "mistake" + assert update.new["title"] == "corrected" + + +@pytest.mark.django_db +def test_bill_delete(session): + b = Bill.objects.create( + title="title", identifier="SB 1", legislative_session=session + ) + b.delete() + + assert Change.objects.count() == 2 + delete = Change.objects.order_by("event_time")[1] + assert delete.old["identifier"] == "SB 1" + assert delete.new is None + + +@pytest.mark.django_db +def test_bill_action_insert(session, org): + b = Bill.objects.create( + title="title", identifier="SB 1", legislative_session=session + ) + b.actions.create( + description="introduced", date="2020-01-01", order=1, organization=org + ) + assert Change.objects.count() == 2 + action = Change.objects.order_by("event_time")[1] + assert action.old is None + assert action.object_id == b.id + assert action.new["description"] == "introduced" + assert action.table_name == "opencivicdata_billaction" + + +@pytest.mark.django_db +def test_bill_action_update(session, org): + b = Bill.objects.create( + title="title", identifier="SB 1", legislative_session=session + ) + a = b.actions.create( + description="introduced", date="2020-01-01", order=1, organization=org + ) + a.classification = ["introduced"] + a.save() + + assert Change.objects.count() == 3 + action = Change.objects.order_by("event_time")[2] + assert action.old == {} + assert action.new["classification"] == ["introduced"] + assert action.object_id == b.id + assert action.table_name == "opencivicdata_billaction" + + +@pytest.mark.django_db +def test_bill_action_delete(session, org): + b = Bill.objects.create( + title="title", identifier="SB 1", legislative_session=session + ) + a = b.actions.create( + description="introduced", date="2020-01-01", order=1, organization=org + ) + a.delete() + + assert Change.objects.count() == 3 + action = Change.objects.order_by("event_time")[2] + assert action.old["description"] == "introduced" + assert action.new is None + assert action.object_id == b.id + assert action.table_name == "opencivicdata_billaction" + + +# NOTE: +# these next tests are for objects that don't have a direct object_id, and are special cased +# in the pl/pgsql code, we don't need to check *all* the functionality, but lets make sure +# they link to the right parent id + + +@pytest.mark.django_db +def test_bill_versionlink_add(session, org): + b = Bill.objects.create( + title="title", identifier="SB 1", legislative_session=session + ) + v = b.versions.create(note="first printing", date="2020-01-01") + v.links.create(url="https://example.com") + + history = Change.objects.order_by("event_time") + assert len(history) == 3 + # table names and object_id are recorded correctly + assert history[0].object_id == history[1].object_id + assert history[2].object_id == str(v.id) + assert history[2].table_name == "opencivicdata_billversionlink" + + +@pytest.mark.django_db +def test_bill_documentlink_add(session, org): + b = Bill.objects.create( + title="title", identifier="SB 1", legislative_session=session + ) + v = b.documents.create(note="fiscal note", date="2020-01-01") + v.links.create(url="https://example.com") + + history = Change.objects.order_by("event_time") + assert len(history) == 3 + # table names and object_id are recorded correctly + assert history[0].object_id == history[1].object_id + assert history[2].object_id == str(v.id) + assert history[2].table_name == "opencivicdata_billdocumentlink" + + +@pytest.mark.django_db +def test_bill_actionrelatedentity_add(session, org): + b = Bill.objects.create( + title="title", identifier="SB 1", legislative_session=session + ) + a = b.actions.create( + description="introduced", date="2020-01-01", order=1, organization=org + ) + a.related_entities.create(name="someone", entity_type="person") + + history = Change.objects.order_by("event_time") + assert len(history) == 3 + # table names and object_id are recorded correctly + assert history[0].object_id == history[1].object_id + assert history[2].object_id == str(a.id) + assert history[2].table_name == "opencivicdata_billactionrelatedentity" + + +# vote tests similarly do not need to test the actual behavior, just that they work + + +@pytest.mark.django_db +def test_vote_objects_history(session, org): + v = VoteEvent.objects.create( + identifier="test vote", + start_date="2020-01-01", + legislative_session=session, + organization=org, + ) + v.counts.create(option="yes", value=1) + v.votes.create(option="yes", voter_name="someone") + v.sources.create(url="https://example.com") + history = Change.objects.order_by("event_time") + assert len(history) == 4 + assert ( + history[0].object_id + == history[1].object_id + == history[2].object_id + == history[3].object_id + ) + # table names and object_id are recorded correctly + assert history[0].table_name == "opencivicdata_voteevent" + assert history[1].table_name == "opencivicdata_votecount" + assert history[2].table_name == "opencivicdata_personvote" + assert history[3].table_name == "opencivicdata_votesource" diff --git a/openstates/settings.py b/openstates/settings.py index bc07f0a0..17aadf1d 100644 --- a/openstates/settings.py +++ b/openstates/settings.py @@ -108,6 +108,7 @@ "bulk", "profiles.apps.ProfilesConfig", "simplekeys", + "history", ] MIDDLEWARE = [ @@ -236,3 +237,44 @@ "Login and visit https://openstates.org/account/profile/ for your API key. " "contact@openstates.org to raise limits" ) + +HISTORY_TABLES = [ + "opencivicdata_bill", + "opencivicdata_billabstract", + "opencivicdata_billtitle", + "opencivicdata_billidentifier", + "opencivicdata_billaction", + "opencivicdata_relatedbill", + "opencivicdata_billsponsorship", + "opencivicdata_billdocument", + "opencivicdata_billversion", + "opencivicdata_billsource", + "opencivicdata_billversionlink", + "opencivicdata_billdocumentlink", + "opencivicdata_billactionrelatedentity", + "opencivicdata_voteevent", + "opencivicdata_votecount", + "opencivicdata_personvote", + "opencivicdata_votesource", +] + +HISTORY_GET_OBJECT_ID_SQL = """ +CREATE OR REPLACE FUNCTION get_object_id(table_name name, r RECORD) returns varchar(100) as $$ +BEGIN + CASE table_name + WHEN 'opencivicdata_bill', 'opencivicdata_voteevent' THEN + RETURN r.id; + WHEN 'opencivicdata_billactionrelatedentity' THEN + RETURN r.action_id; + WHEN 'opencivicdata_billdocumentlink' THEN + RETURN r.document_id; + WHEN 'opencivicdata_billversionlink' THEN + RETURN r.version_id; + WHEN 'opencivicdata_votecount', 'opencivicdata_personvote', 'opencivicdata_votesource' THEN + RETURN r.vote_event_id; + ELSE + RETURN r.bill_id; + END CASE; +END +$$ LANGUAGE plpgsql; +"""