diff --git a/.github/workflows/check_unit_tests.yml b/.github/workflows/check_unit_tests.yml index 00e60c6b..100868e0 100644 --- a/.github/workflows/check_unit_tests.yml +++ b/.github/workflows/check_unit_tests.yml @@ -52,4 +52,4 @@ jobs: uses: codecov/codecov-action@v4 with: token: ${{ secrets.CODECOV_TOKEN }} - verbose: true \ No newline at end of file + verbose: true diff --git a/alembic.ini b/alembic.ini deleted file mode 100644 index 07284f3d..00000000 --- a/alembic.ini +++ /dev/null @@ -1,116 +0,0 @@ -# A generic, single database configuration. - -[alembic] -# path to migration scripts -# Use forward slashes (/) also on windows to provide an os agnostic path -script_location = alembic - -# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s -# Uncomment the line below if you want the files to be prepended with date and time -# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file -# for all available tokens -# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s - -# sys.path path, will be prepended to sys.path if present. -# defaults to the current working directory. -prepend_sys_path = src - -# timezone to use when rendering the date within the migration file -# as well as the filename. -# If specified, requires the python>=3.9 or backports.zoneinfo library. 
-# Any required deps can installed by adding `alembic[tz]` to the pip requirements -# string value is passed to ZoneInfo() -# leave blank for localtime -# timezone = - -# max length of characters to apply to the "slug" field -# truncate_slug_length = 40 - -# set to 'true' to run the environment during -# the 'revision' command, regardless of autogenerate -# revision_environment = false - -# set to 'true' to allow .pyc and .pyo files without -# a source .py file to be detected as revisions in the -# versions/ directory -# sourceless = false - -# version location specification; This defaults -# to alembic/versions. When using multiple version -# directories, initial revisions must be specified with --version-path. -# The path separator used here should be the separator specified by "version_path_separator" below. -# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions - -# version path separator; As mentioned above, this is the character used to split -# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. -# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. -# Valid values for version_path_separator are: -# -# version_path_separator = : -# version_path_separator = ; -# version_path_separator = space -version_path_separator = os # Use os.pathsep. Default configuration used for new projects. - -# set to 'true' to search source files recursively -# in each "version_locations" directory -# new in Alembic version 1.10 -# recursive_version_locations = false - -# the output encoding used when revision files -# are written from script.py.mako -# output_encoding = utf-8 - -sqlalchemy.url = driver://user:pass@localhost/dbname - - -[post_write_hooks] -# post_write_hooks defines scripts or Python functions that are run -# on newly generated revision scripts. 
See the documentation for further -# detail and examples - -# format using "black" - use the console_scripts runner, against the "black" entrypoint -# hooks = black -# black.type = console_scripts -# black.entrypoint = black -# black.options = -l 79 REVISION_SCRIPT_FILENAME - -# lint with attempts to fix using "ruff" - use the exec runner, execute a binary -# hooks = ruff -# ruff.type = exec -# ruff.executable = %(here)s/.venv/bin/ruff -# ruff.options = --fix REVISION_SCRIPT_FILENAME - -# Logging configuration -[loggers] -keys = root,sqlalchemy,alembic - -[handlers] -keys = console - -[formatters] -keys = generic - -[logger_root] -level = WARN -handlers = console -qualname = - -[logger_sqlalchemy] -level = WARN -handlers = -qualname = sqlalchemy.engine - -[logger_alembic] -level = INFO -handlers = -qualname = alembic - -[handler_console] -class = StreamHandler -args = (sys.stderr,) -level = NOTSET -formatter = generic - -[formatter_generic] -format = %(levelname)-5.5s [%(name)s] %(message)s -datefmt = %H:%M:%S diff --git a/alembic/README.md b/alembic/README.md deleted file mode 100644 index 5ff3b47f..00000000 --- a/alembic/README.md +++ /dev/null @@ -1,23 +0,0 @@ -# Migrations - -## Overview - -To manage Master Patient Index (MPI) database migrations, we use [Alembic](https://alembic.sqlalchemy.org/en/latest/) due to its strong support for [SQLAlchemy](https://docs.sqlalchemy.org/en/20/orm/quickstart.html) and all the [databases](https://docs.sqlalchemy.org/en/20/dialects/) it supports. - -## How to Run a Migration - -### Update MPI Schema -**Step 1:** Update `src/recordlinker/linkage/models.py` with the changes you would like to make to the MPI schema. - -### Create a Migration Script -**Step 2:** In the CLI, in the root directory use the following command to run a revision: -```bash -alembic revision --autogenerate -m "" -``` -You can now see a new migration script in the `alembic/versions/` directory. 
- -### Run the Migration -**Step 3:** Finally, use the following command to run the migration: -```bash -alembic upgrade head -``` \ No newline at end of file diff --git a/alembic/env.py b/alembic/env.py deleted file mode 100644 index e32e9711..00000000 --- a/alembic/env.py +++ /dev/null @@ -1,77 +0,0 @@ -from logging.config import fileConfig - -from sqlalchemy import engine_from_config -from sqlalchemy import pool - -from alembic import context -from recordlinker.models import Base -from recordlinker.config import settings - -# this is the Alembic Config object, which provides -# access to the values within the .ini file in use. -config = context.config - -# Interpret the config file for Python logging. -# This line sets up loggers basically. -if config.config_file_name is not None: - fileConfig(config.config_file_name) -config.set_main_option("sqlalchemy.url", settings.db_uri) - -# add your model's MetaData object here -# for 'autogenerate' support -target_metadata = Base.metadata - -# other values from the config, defined by the needs of env.py, -# can be acquired: -# my_important_option = config.get_main_option("my_important_option") -# ... etc. - - -def run_migrations_offline() -> None: - """Run migrations in 'offline' mode. - - This configures the context with just a URL - and not an Engine, though an Engine is acceptable - here as well. By skipping the Engine creation - we don't even need a DBAPI to be available. - - Calls to context.execute() here emit the given string to the - script output. - - """ - url = config.get_main_option("sqlalchemy.url") - context.configure( - url=url, - target_metadata=target_metadata, - literal_binds=True, - dialect_opts={"paramstyle": "named"}, - ) - - with context.begin_transaction(): - context.run_migrations() - - -def run_migrations_online() -> None: - """Run migrations in 'online' mode. - - In this scenario we need to create an Engine - and associate a connection with the context. 
- - """ - connectable = engine_from_config( - config.get_section(config.config_ini_section, {}), - prefix="sqlalchemy.", - poolclass=pool.NullPool, - ) - - with connectable.connect() as connection: - context.configure(connection=connection, target_metadata=target_metadata) - - with context.begin_transaction(): - context.run_migrations() - - -if context.is_offline_mode(): - run_migrations_offline() -else: - run_migrations_online() diff --git a/alembic/script.py.mako b/alembic/script.py.mako deleted file mode 100644 index fbc4b07d..00000000 --- a/alembic/script.py.mako +++ /dev/null @@ -1,26 +0,0 @@ -"""${message} - -Revision ID: ${up_revision} -Revises: ${down_revision | comma,n} -Create Date: ${create_date} - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa -${imports if imports else ""} - -# revision identifiers, used by Alembic. -revision: str = ${repr(up_revision)} -down_revision: Union[str, None] = ${repr(down_revision)} -branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} -depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} - - -def upgrade() -> None: - ${upgrades if upgrades else "pass"} - - -def downgrade() -> None: - ${downgrades if downgrades else "pass"} diff --git a/alembic/versions/0c90faa0378f_create_algorithm_tables.py b/alembic/versions/0c90faa0378f_create_algorithm_tables.py deleted file mode 100644 index 1281986a..00000000 --- a/alembic/versions/0c90faa0378f_create_algorithm_tables.py +++ /dev/null @@ -1,51 +0,0 @@ -"""create algorithm tables - -Revision ID: 0c90faa0378f -Revises: 64ed9566f189 -Create Date: 2024-09-20 11:41:13.377954 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa - - -# revision identifiers, used by Alembic. 
-revision: str = '0c90faa0378f' -down_revision: Union[str, None] = '64ed9566f189' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! ### - op.create_table('algorithm', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('is_default', sa.Boolean(), nullable=False), - sa.Column('label', sa.String(length=255), nullable=False), - sa.Column('description', sa.Text(), nullable=False), - sa.PrimaryKeyConstraint('id'), - sa.UniqueConstraint('label') - ) - op.create_index(op.f('ix_algorithm_is_default'), 'algorithm', ['is_default'], unique=False) - op.create_table('algorithm_pass', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('algorithm_id', sa.Integer(), nullable=False), - sa.Column('blocking_keys', sa.JSON(), nullable=False), - sa.Column('evaluators', sa.JSON(), nullable=False), - sa.Column('rule', sa.String(length=255), nullable=False), - sa.Column('cluster_ratio', sa.Float(), nullable=False), - sa.Column('kwargs', sa.JSON(), nullable=False), - sa.ForeignKeyConstraint(['algorithm_id'], ['algorithm.id'], ), - sa.PrimaryKeyConstraint('id') - ) - # ### end Alembic commands ### - - -def downgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - op.drop_table('algorithm_pass') - op.drop_index(op.f('ix_algorithm_is_default'), table_name='algorithm') - op.drop_table('algorithm') - # ### end Alembic commands ### diff --git a/alembic/versions/6052c193a26a_add_blockingvalue_table.py b/alembic/versions/6052c193a26a_add_blockingvalue_table.py deleted file mode 100644 index 30db9149..00000000 --- a/alembic/versions/6052c193a26a_add_blockingvalue_table.py +++ /dev/null @@ -1,56 +0,0 @@ -"""add BlockingValue table - -Revision ID: 6052c193a26a -Revises: 773a9759734c -Create Date: 2024-09-13 13:27:51.853408 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa - - -# revision identifiers, used by Alembic. -revision: str = '6052c193a26a' -down_revision: Union[str, None] = '773a9759734c' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! ### - op.drop_table('mpi_blocking_key') - op.create_table('mpi_blocking_key', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('key', sa.String(length=50), nullable=False), - sa.PrimaryKeyConstraint('id') - ) - op.create_table('mpi_blocking_value', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('patient_id', sa.Integer(), nullable=False), - sa.Column('blockingkey_id', sa.Integer(), nullable=False), - sa.Column('value', sa.String(length=50), nullable=False), - sa.ForeignKeyConstraint(['blockingkey_id'], ['mpi_blocking_key.id'], ), - sa.ForeignKeyConstraint(['patient_id'], ['mpi_patient.id'], ), - sa.PrimaryKeyConstraint('id') - ) - op.create_index(op.f('ix_mpi_blocking_value_value'), 'mpi_blocking_value', ['value'], unique=False) - # ### end Alembic commands ### - - -def downgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - op.drop_table('mpi_blocking_value') - op.drop_table('mpi_blocking_key') - op.create_table('mpi_blocking_key', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('patient_id', sa.Integer(), nullable=False), - sa.Column('key', sa.String(length=50), nullable=False), - sa.Column('value', sa.String(length=50), nullable=False), - sa.ForeignKeyConstraint(['patient_id'], ['mpi_patient.id'], ), - sa.PrimaryKeyConstraint('id') - ) - op.create_index(op.f('ix_mpi_blocking_key_key'), 'mpi_blocking_key', ['key'], unique=False) - op.create_index(op.f('ix_mpi_blocking_key_value'), 'mpi_blocking_key', ['value'], unique=False) - # ### end Alembic commands ### diff --git a/alembic/versions/64ed9566f189_external_person_id_should_be_nullable.py b/alembic/versions/64ed9566f189_external_person_id_should_be_nullable.py deleted file mode 100644 index 8a370fbd..00000000 --- a/alembic/versions/64ed9566f189_external_person_id_should_be_nullable.py +++ /dev/null @@ -1,38 +0,0 @@ -"""external person id should be nullable - -Revision ID: 64ed9566f189 -Revises: bfbd015ca466 -Create Date: 2024-09-18 20:22:07.510203 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa - - -# revision identifiers, used by Alembic. -revision: str = '64ed9566f189' -down_revision: Union[str, None] = 'bfbd015ca466' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - op.drop_index('ix_mpi_blocking_value_value', table_name='mpi_blocking_value') - op.drop_column('mpi_patient', 'external_person_source') - op.drop_column('mpi_patient', 'external_person_id') - op.add_column('mpi_patient', sa.Column('external_person_id', sa.String(length=255), nullable=True)) - op.add_column('mpi_patient', sa.Column('external_person_source', sa.String(length=100), nullable=True)) - # ### end Alembic commands ### - - -def downgrade() -> None: - # ### commands auto generated by Alembic - please adjust! ### - op.drop_column('mpi_patient', 'external_person_source') - op.drop_column('mpi_patient', 'external_person_id') - op.add_column('mpi_patient', sa.Column('external_person_id', sa.String(length=255), nullable=False)) - op.add_column('mpi_patient', sa.Column('external_person_source', sa.String(length=100), nullable=False)) - op.create_index('ix_mpi_blocking_value_value', 'mpi_blocking_value', ['value'], unique=False) - # ### end Alembic commands ### diff --git a/alembic/versions/773a9759734c_redesign_mpi_schema.py b/alembic/versions/773a9759734c_redesign_mpi_schema.py deleted file mode 100644 index 25bf0ac4..00000000 --- a/alembic/versions/773a9759734c_redesign_mpi_schema.py +++ /dev/null @@ -1,64 +0,0 @@ -"""redesign mpi schema - -Revision ID: 773a9759734c -Revises: -Create Date: 2024-09-11 15:34:14.163676 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa - - -# revision identifiers, used by Alembic. -revision: str = '773a9759734c' -down_revision: Union[str, None] = None -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - op.create_table('mpi_person', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('internal_id', sa.Uuid(), nullable=False), - sa.PrimaryKeyConstraint('id') - ) - op.create_table('mpi_external_person', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('person_id', sa.Integer(), nullable=False), - sa.Column('external_id', sa.String(length=255), nullable=False), - sa.Column('source', sa.String(length=255), nullable=False), - sa.ForeignKeyConstraint(['person_id'], ['mpi_person.id'], ), - sa.PrimaryKeyConstraint('id') - ) - op.create_table('mpi_patient', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('person_id', sa.Integer(), nullable=False), - sa.Column('data', sa.JSON(), nullable=False), - sa.ForeignKeyConstraint(['person_id'], ['mpi_person.id'], ), - sa.PrimaryKeyConstraint('id') - ) - op.create_table('mpi_blocking_key', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('patient_id', sa.Integer(), nullable=False), - sa.Column('key', sa.String(length=50), nullable=False), - sa.Column('value', sa.String(length=50), nullable=False), - sa.ForeignKeyConstraint(['patient_id'], ['mpi_patient.id'], ), - sa.PrimaryKeyConstraint('id') - ) - op.create_index(op.f('ix_mpi_blocking_key_key'), 'mpi_blocking_key', ['key'], unique=False) - op.create_index(op.f('ix_mpi_blocking_key_value'), 'mpi_blocking_key', ['value'], unique=False) - # ### end Alembic commands ### - - -def downgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - op.drop_index(op.f('ix_mpi_blocking_key_value'), table_name='mpi_blocking_key') - op.drop_index(op.f('ix_mpi_blocking_key_key'), table_name='mpi_blocking_key') - op.drop_table('mpi_blocking_key') - op.drop_table('mpi_patient') - op.drop_table('mpi_external_person') - op.drop_table('mpi_person') - # ### end Alembic commands ### diff --git a/alembic/versions/a30b8bbccfdf_add_dibbs_algorithms.py b/alembic/versions/a30b8bbccfdf_add_dibbs_algorithms.py deleted file mode 100644 index fb07bdc0..00000000 --- a/alembic/versions/a30b8bbccfdf_add_dibbs_algorithms.py +++ /dev/null @@ -1,130 +0,0 @@ -"""add dibbs algorithms - -Revision ID: a30b8bbccfdf -Revises: d9eba1bdbad1 -Create Date: 2024-09-26 15:10:15.179656 - -""" - -from typing import Sequence -from typing import Union - -import sqlalchemy as sa - -from alembic import op -from recordlinker.models import Algorithm -from recordlinker.models import AlgorithmPass -from recordlinker.models import BlockingKey - -# revision identifiers, used by Alembic. -revision: str = 'a30b8bbccfdf' -down_revision: Union[str, None] = 'd9eba1bdbad1' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - -FUZZY_THRESHOLDS = { - "first_name": 0.9, - "last_name": 0.9, - "birthdate": 0.95, - "address": 0.9, - "city": 0.92, - "zip": 0.95, -} - -LOG_ODDS_SCORES = { - "address": 8.438284928858774, - "birthdate": 10.126641103800338, - "city": 2.438553006137189, - "first_name": 6.849475906891162, - "last_name": 6.350720397426025, - "mrn": 0.3051262572525359, - "sex": 0.7510419059643679, - "state": 0.022376768992488694, - "zip": 4.975031471124867, -} - -DIBBS_BASIC = { - "id": 1, - "is_default": True, - "label": "DIBBS_BASIC", - "description": "Compares the fields of two records using string similarity scoring. If similarity score is above fuzzy threshold then the fields agree. If all fields being considered agree, then the records are a match." 
-} - -DIBBS_ENHANCED = { - "id": 2, - "is_default": False, - "label": "DIBBS_ENHANCED", - "description": "Similair to the basic algorithm with the addition of log odds scoring. String comparison scores are multiplied by unique scoring weights for each field. If the sum of all considered weights is greater than a threshold then the records are a match." -} - -DIBBS_BASIC_PASS_ONE = { - "id": 1, - "algorithm_id": 1, - "blocking_keys": [BlockingKey.BIRTHDATE.name, BlockingKey.MRN.name, BlockingKey.SEX.name], - "evaluators": {"first_name": "func:recordlinker.linking.matchers.feature_match_fuzzy_string", "last_name": "func:recordlinker.linking.matchers.feature_match_exact"}, - "rule": "func:recordlinker.linking.matchers.eval_perfect_match", - "cluster_ratio": 0.9, - "kwargs": {"thresholds": FUZZY_THRESHOLDS} -} - -DIBBS_BASIC_PASS_TWO = { - "id": 2, - "algorithm_id": 1, - "blocking_keys": [BlockingKey.ZIP.name, BlockingKey.FIRST_NAME.name, BlockingKey.LAST_NAME.name, BlockingKey.SEX.name], - "evaluators": {"address": "func:recordlinker.linking.matchers.feature_match_fuzzy_string", "birthdate": "func:recordlinker.linking.matchers.feature_match_exact"}, - "rule": "func:recordlinker.linking.matchers.eval_perfect_match", - "cluster_ratio": 0.9, - "kwargs": {"thresholds": FUZZY_THRESHOLDS} -} - -DIBBS_ENHANCED_PASS_ONE = { - "id": 3, - "algorithm_id": 2, - "blocking_keys": [BlockingKey.BIRTHDATE.name, BlockingKey.MRN.name, BlockingKey.SEX.name], - "evaluators": {"first_name": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare", "last_name": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare"}, - "rule": "func:recordlinker.linking.matchers.eval_log_odds_cutoff", - "cluster_ratio": 0.9, - "kwargs": { - "similarity_measure": "JaroWinkler", - "thresholds": FUZZY_THRESHOLDS, - "true_match_threshold": 12.2, - "log_odds": LOG_ODDS_SCORES, - } -} - -DIBBS_ENHANCED_PASS_TWO = { - "id": 4, - "algorithm_id": 2, - "blocking_keys": 
[BlockingKey.ZIP.name, BlockingKey.FIRST_NAME.name, BlockingKey.LAST_NAME.name, BlockingKey.SEX.name], - "evaluators": {"address": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare", "birthdate": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare"}, - "rule": "func:recordlinker.linking.matchers.eval_log_odds_cutoff", - "cluster_ratio": 0.9, - "kwargs": { - "similarity_measure": "JaroWinkler", - "thresholds": FUZZY_THRESHOLDS, - "true_match_threshold": 17.0, - "log_odds": LOG_ODDS_SCORES, - } -} - -def upgrade() -> None: - #insert alogithms - op.execute(sa.insert(Algorithm).values(DIBBS_BASIC)) - op.execute(sa.insert(Algorithm).values(DIBBS_ENHANCED)) - - # #insert algorithm passes - op.execute(sa.insert(AlgorithmPass).values(DIBBS_BASIC_PASS_ONE)) - op.execute(sa.insert(AlgorithmPass).values(DIBBS_BASIC_PASS_TWO)) - op.execute(sa.insert(AlgorithmPass).values(DIBBS_ENHANCED_PASS_ONE)) - op.execute(sa.insert(AlgorithmPass).values(DIBBS_ENHANCED_PASS_TWO)) - -def downgrade() -> None: - # #delete algorithm pass rows - op.execute(sa.delete(AlgorithmPass).where(AlgorithmPass.id == 1)) - op.execute(sa.delete(AlgorithmPass).where(AlgorithmPass.id == 2)) - op.execute(sa.delete(AlgorithmPass).where(AlgorithmPass.id == 3)) - op.execute(sa.delete(AlgorithmPass).where(AlgorithmPass.id == 4)) - - #delete algorithm rows - op.execute(sa.delete(Algorithm).where(Algorithm.id == 1)) - op.execute(sa.delete(Algorithm).where(Algorithm.id == 2)) diff --git a/alembic/versions/ad18f1d41fad_convert_blockingkey_into_enum.py b/alembic/versions/ad18f1d41fad_convert_blockingkey_into_enum.py deleted file mode 100644 index cbcce96c..00000000 --- a/alembic/versions/ad18f1d41fad_convert_blockingkey_into_enum.py +++ /dev/null @@ -1,57 +0,0 @@ -"""convert BlockingKey into enum - -Revision ID: ad18f1d41fad -Revises: 6052c193a26a -Create Date: 2024-09-17 21:15:37.714595 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy 
as sa - - -# revision identifiers, used by Alembic. -revision: str = 'ad18f1d41fad' -down_revision: Union[str, None] = '6052c193a26a' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! ### - op.drop_table('mpi_blocking_value') - op.drop_table('mpi_blocking_key') - - op.create_table('mpi_blocking_value', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('patient_id', sa.Integer(), nullable=False), - sa.Column('blockingkey', sa.Integer(), nullable=False), - sa.Column('value', sa.String(length=50), nullable=False), - sa.ForeignKeyConstraint(['patient_id'], ['mpi_patient.id'], ), - sa.PrimaryKeyConstraint('id') - ) - op.create_index('idx_blocking_value_patient_key_value', 'mpi_blocking_value', ['patient_id', 'blockingkey', 'value'], unique=False) - # ### end Alembic commands ### - - -def downgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - op.drop_table('mpi_blocking_value') - - op.create_table('mpi_blocking_key', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('key', sa.String(length=50), nullable=False), - sa.PrimaryKeyConstraint('id') - ) - op.create_table('mpi_blocking_value', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('patient_id', sa.Integer(), nullable=False), - sa.Column('blockingkey_id', sa.Integer(), nullable=False), - sa.Column('value', sa.String(length=50), nullable=False), - sa.ForeignKeyConstraint(['blockingkey_id'], ['mpi_blocking_key.id'], ), - sa.ForeignKeyConstraint(['patient_id'], ['mpi_patient.id'], ), - sa.PrimaryKeyConstraint('id') - ) - op.create_index(op.f('ix_mpi_blocking_value_value'), 'mpi_blocking_value', ['value'], unique=False) - # ### end Alembic commands ### diff --git a/alembic/versions/bfbd015ca466_moving_externalperson_to_patient_table.py b/alembic/versions/bfbd015ca466_moving_externalperson_to_patient_table.py deleted file mode 100644 index 800e44b3..00000000 --- a/alembic/versions/bfbd015ca466_moving_externalperson_to_patient_table.py +++ /dev/null @@ -1,43 +0,0 @@ -"""moving ExternalPerson to Patient table - -Revision ID: bfbd015ca466 -Revises: ad18f1d41fad -Create Date: 2024-09-18 20:10:30.193941 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa - - -# revision identifiers, used by Alembic. -revision: str = 'bfbd015ca466' -down_revision: Union[str, None] = 'ad18f1d41fad' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - op.drop_table('mpi_external_person') - op.create_index(op.f('ix_mpi_blocking_value_value'), 'mpi_blocking_value', ['value'], unique=False) - op.add_column('mpi_patient', sa.Column('external_person_id', sa.String(length=255), nullable=False)) - op.add_column('mpi_patient', sa.Column('external_person_source', sa.String(length=100), nullable=False)) - # ### end Alembic commands ### - - -def downgrade() -> None: - # ### commands auto generated by Alembic - please adjust! ### - op.drop_column('mpi_patient', 'external_person_source') - op.drop_column('mpi_patient', 'external_person_id') - op.drop_index(op.f('ix_mpi_blocking_value_value'), table_name='mpi_blocking_value') - op.create_table('mpi_external_person', - sa.Column('id', sa.INTEGER(), nullable=False), - sa.Column('person_id', sa.INTEGER(), nullable=False), - sa.Column('external_id', sa.VARCHAR(length=255), nullable=False), - sa.Column('source', sa.VARCHAR(length=255), nullable=False), - sa.ForeignKeyConstraint(['person_id'], ['mpi_person.id'], ), - sa.PrimaryKeyConstraint('id') - ) - # ### end Alembic commands ### diff --git a/alembic/versions/d9eba1bdbad1_recreating_mpi_schema.py b/alembic/versions/d9eba1bdbad1_recreating_mpi_schema.py deleted file mode 100644 index d0037db3..00000000 --- a/alembic/versions/d9eba1bdbad1_recreating_mpi_schema.py +++ /dev/null @@ -1,56 +0,0 @@ -"""recreating mpi schema - -Revision ID: d9eba1bdbad1 -Revises: f5716ac94693 -Create Date: 2024-09-25 21:17:35.127908 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa - - -# revision identifiers, used by Alembic. -revision: str = 'd9eba1bdbad1' -down_revision: Union[str, None] = 'f5716ac94693' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - op.create_table('mpi_person', - sa.Column('id', sa.BigInteger().with_variant(sa.INTEGER(), 'sqlite'), autoincrement=True, nullable=False), - sa.Column('internal_id', sa.Uuid(), nullable=False), - sa.PrimaryKeyConstraint('id') - ) - op.create_table('mpi_patient', - sa.Column('id', sa.BigInteger().with_variant(sa.INTEGER(), 'sqlite'), autoincrement=True, nullable=False), - sa.Column('person_id', sa.BigInteger().with_variant(sa.INTEGER(), 'sqlite'), nullable=False), - sa.Column('data', sa.JSON(), nullable=False), - sa.Column('external_patient_id', sa.String(length=255), nullable=True), - sa.Column('external_person_id', sa.String(length=255), nullable=True), - sa.Column('external_person_source', sa.String(length=100), nullable=True), - sa.ForeignKeyConstraint(['person_id'], ['mpi_person.id'], ), - sa.PrimaryKeyConstraint('id') - ) - op.create_table('mpi_blocking_value', - sa.Column('id', sa.BigInteger().with_variant(sa.INTEGER(), 'sqlite'), autoincrement=True, nullable=False), - sa.Column('patient_id', sa.BigInteger().with_variant(sa.INTEGER(), 'sqlite'), nullable=False), - sa.Column('blockingkey', sa.SmallInteger(), nullable=False), - sa.Column('value', sa.String(length=20), nullable=False), - sa.ForeignKeyConstraint(['patient_id'], ['mpi_patient.id'], ), - sa.PrimaryKeyConstraint('id') - ) - op.create_index('idx_blocking_value_patient_key_value', 'mpi_blocking_value', ['patient_id', 'blockingkey', 'value'], unique=False) - # ### end Alembic commands ### - - -def downgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - op.drop_index('idx_blocking_value_patient_key_value', table_name='mpi_blocking_value') - op.drop_table('mpi_blocking_value') - op.drop_table('mpi_patient') - op.drop_table('mpi_person') - # ### end Alembic commands ### diff --git a/alembic/versions/f5716ac94693_dropping_mpi_schema.py b/alembic/versions/f5716ac94693_dropping_mpi_schema.py deleted file mode 100644 index 968ededd..00000000 --- a/alembic/versions/f5716ac94693_dropping_mpi_schema.py +++ /dev/null @@ -1,55 +0,0 @@ -"""dropping mpi schema - -Revision ID: f5716ac94693 -Revises: 0c90faa0378f -Create Date: 2024-09-25 15:47:22.497271 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa -from sqlalchemy.dialects import sqlite - -# revision identifiers, used by Alembic. -revision: str = 'f5716ac94693' -down_revision: Union[str, None] = '0c90faa0378f' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! ### - op.drop_index('idx_blocking_value_patient_key_value', table_name='mpi_blocking_value') - op.drop_table('mpi_blocking_value') - op.drop_table('mpi_patient') - op.drop_table('mpi_person') - # ### end Alembic commands ### - - -def downgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - op.create_table('mpi_person', - sa.Column('id', sa.INTEGER(), nullable=False), - sa.Column('internal_id', sa.CHAR(length=32), nullable=False), - sa.PrimaryKeyConstraint('id') - ) - op.create_table('mpi_patient', - sa.Column('id', sa.INTEGER(), nullable=False), - sa.Column('person_id', sa.INTEGER(), nullable=False), - sa.Column('data', sqlite.JSON(), nullable=False), - sa.Column('external_person_id', sa.VARCHAR(length=255), nullable=True), - sa.Column('external_person_source', sa.VARCHAR(length=100), nullable=True), - sa.ForeignKeyConstraint(['person_id'], ['mpi_person.id'], ), - sa.PrimaryKeyConstraint('id') - ) - op.create_table('mpi_blocking_value', - sa.Column('id', sa.INTEGER(), nullable=False), - sa.Column('patient_id', sa.INTEGER(), nullable=False), - sa.Column('blockingkey', sa.INTEGER(), nullable=False), - sa.Column('value', sa.VARCHAR(length=50), nullable=False), - sa.ForeignKeyConstraint(['patient_id'], ['mpi_patient.id'], ), - sa.PrimaryKeyConstraint('id') - ) - op.create_index('idx_blocking_value_patient_key_value', 'mpi_blocking_value', ['patient_id', 'blockingkey', 'value'], unique=False) - # ### end Alembic commands ### diff --git a/assets/general/patient_resource_w_extensions.json b/assets/general/patient_resource_w_extensions.json deleted file mode 100644 index e3c5b6b8..00000000 --- a/assets/general/patient_resource_w_extensions.json +++ /dev/null @@ -1,172 +0,0 @@ -{ - "resourceType": "Patient", - "identifier": [ - { - "value": "123456", - "type": { - "coding": [ - { - "code": "MR", - "system": "http://terminology.hl7.org/CodeSystem/v2-0203", - "display": "Medical record number" - } - ] - }, - "system": "urn...no idea" - } - ], - "name": [ - { - "family": "doe", - "given": [ - "John ", - " Danger " - ], - "use": "official" - } - ], - "birthDate": "1983-02-01", - "gender": "female", - "extension": [ - { - "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", - "extension": [ - { - "url": "ombCategory", - "valueCoding": { - 
"code": "2106-3", - "display": "White" - } - }, - { - "url": "text", - "valueString": "White" - } - ] - }, - { - "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", - "extension": [ - { - "url": "ombCategory", - "valueCoding": { - "code": "2135-2", - "display": "Hispanic or Latino" - } - }, - { - "url": "text", - "valueString": "Hispanic or Latino" - } - ] - }, - { - "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex", - "extension": [ - { - "url": "value", - "valueCodeableConcept": { - "coding": [ - { - "code": "M", - "display": "Male", - "system": "urn:oid:2.16.840.1.113883.5.1" - } - ] - } - } - ] - }, - { - "url": "http://hl7.org/fhir/us/ecr/StructureDefinition/us-ph-genderidentity-extension", - "extension": [ - { - "url": "value", - "valueCodeableConcept": { - "coding": [ - { - "code": "446141000124107", - "display": "Identifies as female gender (finding)", - "system": "http://snomed.info/sct" - } - ] - } - } - ] - }, - { - "url": "http://hl7.org/fhir/us/ecr/StructureDefinition/us-ph-tribal-affiliation-extension", - "extension": [ - { - "url": "TribeName", - "valueCoding": { - "code": "91", - "display": "Fort Mojave Indian Tribe of Arizona, California", - "system": "http://terminology.hl7.org/CodeSystem/v3-TribalEntityUS" - } - }, - { - "url": "EnrolledTribeMember", - "valueBoolean": true - } - ] - }, - { - "url": "http://hl7.org/fhir/us/ecr/StructureDefinition/us-ph-tribal-affiliation-extension", - "extension": [ - { - "url": "TribeName", - "valueCoding": { - "code": "338", - "display": "Native Village of Afognak", - "system": "http://terminology.hl7.org/CodeSystem/v3-TribalEntityUS" - } - }, - { - "url": "EnrolledTribeMember", - "valueBoolean": false - } - ] - } - ], - "address": [ - { - "line": [ - "123 Fake St", - "Unit #F" - ], - "BuildingNumber": "123", - "city": "Faketon", - "state": "NY", - "postalCode": "10001-0001", - "country": "USA", - "use": "home", - "extension": [ - { - "url": 
"http://hl7.org/fhir/StructureDefinition/geolocation", - "extension": [ - { - "url": "latitude", - "valueDecimal": 34.58002 - }, - { - "url": "longitude", - "valueDecimal": -118.08925 - } - ] - } - ] - } - ], - "telecom": [ - { - "use": "home", - "system": "phone", - "value": "123-456-7890" - }, - { - "value": "johndanger@doe.net", - "system": "email" - } - ] -} diff --git a/assets/initial_algorithms.json b/assets/initial_algorithms.json new file mode 100644 index 00000000..51c40876 --- /dev/null +++ b/assets/initial_algorithms.json @@ -0,0 +1,136 @@ +[ + { + "label": "dibbs-basic", + "description": "The DIBBs Default Algorithm. Based on field experimentation and statistical analysis, this deterministic two-pass algorithm combines geographical and personal information to maximize linkage quality while minimizing false positives", + "is_default": true, + "passes": [ + { + "blocking_keys": [ + "BIRTHDATE", + "MRN", + "SEX" + ], + "evaluators": { + "FIRST_NAME": "func:recordlinker.linking.matchers.feature_match_fuzzy_string", + "LAST_NAME": "func:recordlinker.linking.matchers.feature_match_exact" + }, + "rule": "func:recordlinker.linking.matchers.eval_perfect_match", + "cluster_ratio": 0.9, + "kwargs": { + "thresholds": { + "FIRST_NAME": 0.9, + "LAST_NAME": 0.9, + "BIRTHDATE": 0.95, + "ADDRESS": 0.9, + "CITY": 0.92, + "ZIP": 0.95 + } + } + }, + { + "blocking_keys": [ + "ZIP", + "FIRST_NAME", + "LAST_NAME", + "SEX" + ], + "evaluators": { + "ADDRESS": "func:recordlinker.linking.matchers.feature_match_fuzzy_string", + "BIRTHDATE": "func:recordlinker.linking.matchers.feature_match_exact" + }, + "rule": "func:recordlinker.linking.matchers.eval_perfect_match", + "cluster_ratio": 0.9, + "kwargs": { + "thresholds": { + "FIRST_NAME": 0.9, + "LAST_NAME": 0.9, + "BIRTHDATE": 0.95, + "ADDRESS": 0.9, + "CITY": 0.92, + "ZIP": 0.95 + } + } + } + ] + }, + { + "label": "dibbs-enhanced", + "description": "The DIBBs Log-Odds Algorithm. 
This optional algorithm uses statistical correction to adjust the links between incoming records and previously processed patients (it does so by taking advantage of the fact that some fields are more informative than others—e.g., two records matching on MRN is stronger evidence that they should be linked than if the records matched on zip code). It can be used if additional granularity in matching links is desired. However, while the DIBBs Log-Odds Algorithm can create higher-quality links, it is dependent on statistical updating and pre-calculated population analysis, which requires some work on the part of the user. For those cases where additional precision or stronger matching criteria are required, the Log-Odds algorithm is detailed below.", + "is_default": false, + "passes": [ + { + "blocking_keys": [ + "BIRTHDATE", + "MRN", + "SEX" + ], + "evaluators": { + "FIRST_NAME": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare", + "LAST_NAME": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare" + }, + "rule": "func:recordlinker.linking.matchers.eval_log_odds_cutoff", + "cluster_ratio": 0.9, + "kwargs": { + "similarity_measure": "JaroWinkler", + "thresholds": { + "FIRST_NAME": 0.9, + "LAST_NAME": 0.9, + "BIRTHDATE": 0.95, + "ADDRESS": 0.9, + "CITY": 0.92, + "ZIP": 0.95 + }, + "true_match_threshold": 12.2, + "log_odds": { + "ADDRESS": 8.438284928858774, + "BIRTHDATE": 10.126641103800338, + "CITY": 2.438553006137189, + "FIRST_NAME": 6.849475906891162, + "LAST_NAME": 6.350720397426025, + "MRN": 0.3051262572525359, + "SEX": 0.7510419059643679, + "STATE": 0.022376768992488694, + "ZIP": 4.975031471124867 + } + } + }, + { + "blocking_keys": [ + "ZIP", + "FIRST_NAME", + "LAST_NAME", + "SEX" + ], + "evaluators": { + "ADDRESS": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare", + "BIRTHDATE": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare" + }, + "rule": 
"func:recordlinker.linking.matchers.eval_log_odds_cutoff", + "cluster_ratio": 0.9, + "kwargs": { + "similarity_measure": "JaroWinkler", + "thresholds": { + "FIRST_NAME": 0.9, + "LAST_NAME": 0.9, + "BIRTHDATE": 0.95, + "ADDRESS": 0.9, + "CITY": 0.92, + "ZIP": 0.95 + }, + "true_match_threshold": 17.0, + "log_odds": { + "ADDRESS": 8.438284928858774, + "BIRTHDATE": 10.126641103800338, + "CITY": 2.438553006137189, + "FIRST_NAME": 6.849475906891162, + "LAST_NAME": 6.350720397426025, + "MRN": 0.3051262572525359, + "SEX": 0.7510419059643679, + "STATE": 0.022376768992488694, + "ZIP": 4.975031471124867 + } + } + } + ] + } +] diff --git a/assets/linking/basic_algorithm.json b/assets/linking/basic_algorithm.json deleted file mode 100644 index 6f76a859..00000000 --- a/assets/linking/basic_algorithm.json +++ /dev/null @@ -1,68 +0,0 @@ -{ - "algorithm": [ - { - "funcs": { - "first_name": "func:recordlinker.linking.matchers.feature_match_fuzzy_string", - "last_name": "func:recordlinker.linking.matchers.feature_match_exact" - }, - "blocks": [ - { - "value": "birthdate" - }, - { - "value": "mrn", - "transformation": "last4" - }, - { - "value": "sex" - } - ], - "matching_rule": "func:recordlinker.linking.matchers.eval_perfect_match", - "cluster_ratio": 0.9, - "kwargs": { - "thresholds": { - "first_name": 0.9, - "last_name": 0.9, - "birthdate": 0.95, - "address": 0.9, - "city": 0.92, - "zip": 0.95 - } - } - }, - { - "funcs": { - "address": "func:recordlinker.linking.matchers.feature_match_fuzzy_string", - "birthdate": "func:recordlinker.linking.matchers.feature_match_exact" - }, - "blocks": [ - { - "value": "zip" - }, - { - "value": "first_name", - "transformation": "first4" - }, - { - "value": "last_name", - "transformation": "first4" - }, - { - "value": "sex" - } - ], - "matching_rule": "func:recordlinker.linking.matchers.eval_perfect_match", - "cluster_ratio": 0.9, - "kwargs": { - "thresholds": { - "first_name": 0.9, - "last_name": 0.9, - "birthdate": 0.95, - "address": 0.9, - 
"city": 0.92, - "zip": 0.95 - } - } - } - ] -} diff --git a/assets/linking/enhanced_algorithm.json b/assets/linking/enhanced_algorithm.json deleted file mode 100644 index f9335da9..00000000 --- a/assets/linking/enhanced_algorithm.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "algorithm": [ - { - "funcs": { - "first_name": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare", - "last_name": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare" - }, - "blocks": [ - { - "value": "birthdate" - }, - { - "value": "mrn", - "transformation": "last4" - }, - { - "value": "sex" - } - ], - "matching_rule": "func:recordlinker.linking.matchers.eval_log_odds_cutoff", - "cluster_ratio": 0.9, - "kwargs": { - "similarity_measure": "JaroWinkler", - "thresholds": { - "first_name": 0.9, - "last_name": 0.9, - "birthdate": 0.95, - "address": 0.9, - "city": 0.92, - "zip": 0.95 - }, - "true_match_threshold": 12.2, - "log_odds": { - "address": 8.438284928858774, - "birthdate": 10.126641103800338, - "city": 2.438553006137189, - "first_name": 6.849475906891162, - "last_name": 6.350720397426025, - "mrn": 0.3051262572525359, - "sex": 0.7510419059643679, - "state": 0.022376768992488694, - "zip": 4.975031471124867 - } - } - }, - { - "funcs": { - "address": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare", - "birthdate": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare" - }, - "blocks": [ - { - "value": "zip" - }, - { - "value": "first_name", - "transformation": "first4" - }, - { - "value": "last_name", - "transformation": "first4" - }, - { - "value": "sex" - } - ], - "matching_rule": "func:recordlinker.linking.matchers.eval_log_odds_cutoff", - "cluster_ratio": 0.9, - "kwargs": { - "similarity_measure": "JaroWinkler", - "thresholds": { - "first_name": 0.9, - "last_name": 0.9, - "birthdate": 0.95, - "address": 0.9, - "city": 0.92, - "zip": 0.95 - }, - "true_match_threshold": 17.0, - "log_odds": { - "address": 
8.438284928858774, - "birthdate": 10.126641103800338, - "city": 2.438553006137189, - "first_name": 6.849475906891162, - "last_name": 6.350720397426025, - "mrn": 0.3051262572525359, - "sex": 0.7510419059643679, - "state": 0.022376768992488694, - "zip": 4.975031471124867 - } - } - } - ] -} diff --git a/assets/patient_bundle_to_link_with_mpi.json b/assets/patient_bundle_to_link_with_mpi.json deleted file mode 100644 index f98b1857..00000000 --- a/assets/patient_bundle_to_link_with_mpi.json +++ /dev/null @@ -1,559 +0,0 @@ -{ - "resourceType": "Bundle", - "identifier": { - "value": "a very contrived FHIR bundle" - }, - "entry": [ - { - "resource": { - "resourceType": "Patient", - "id": "f6a16ff7-4a31-11eb-be7b-8344edc8f36b", - "identifier": [ - { - "value": "1234567890", - "type": { - "coding": [ - { - "code": "MR", - "system": "http://terminology.hl7.org/CodeSystem/v2-0203", - "display": "Medical record number" - } - ] - } - } - ], - "name": [ - { - "family": "Shepard", - "given": [ - "John" - ], - "use": "official" - } - ], - "birthDate": "2053-11-07", - "gender": "male", - "address": [ - { - "line": [ - "1234 Silversun Strip" - ], - "buildingNumber": "1234", - "city": "Boston", - "state": "Massachusetts", - "postalCode": "99999", - "use": "home" - } - ], - "telecom": [ - { - "use": "home", - "system": "phone", - "value": "123-456-7890" - } - ] - } - }, - { - "resource": { - "resourceType": "Patient", - "id": "2fdd0b8b-4a70-11eb-99fd-ad786a821574", - "identifier": [ - { - "value": "1234567890", - "type": { - "coding": [ - { - "code": "MR", - "system": "http://terminology.hl7.org/CodeSystem/v2-0203", - "display": "Medical record number" - } - ] - } - } - ], - "name": [ - { - "family": "Shepard", - "given": [ - "Jon" - ], - "use": "official" - } - ], - "birthDate": "2053-11-07", - "gender": "male", - "address": [ - { - "line": [ - "1234 Silversun Strip" - ], - "buildingNumber": "1234", - "city": "Boston", - "state": "Massachusetts", - "postalCode": "99999", - "use": 
"home" - } - ], - "telecom": [ - { - "use": "home", - "system": "phone", - "value": "123-456-7890" - } - ] - } - }, - { - "resource": { - "resourceType": "Patient", - "id": "2c6d5fd1-4a70-11eb-99fd-ad786a821574", - "identifier": [ - { - "value": "7894561235", - "type": { - "coding": [ - { - "code": "MR", - "system": "http://terminology.hl7.org/CodeSystem/v2-0203", - "display": "Medical record number" - } - ] - } - } - ], - "name": [ - { - "family": "Vas Normandy", - "given": [ - "Tali", - "Zora" - ], - "use": "official" - } - ], - "birthDate": "2060-05-14", - "gender": "female", - "address": [ - { - "line": [ - "PO Box 1", - "First Rock" - ], - "city": "Bozeman", - "state": "Montana", - "postalCode": "11111", - "use": "home" - } - ] - } - }, - { - "resource": { - "resourceType": "Patient", - "id": "fd645c21-4a6f-11eb-99fd-ad786a821574", - "identifier": [ - { - "value": "7845451380", - "type": { - "coding": [ - { - "code": "MR", - "system": "http://terminology.hl7.org/CodeSystem/v2-0203", - "display": "Medical record number" - } - ] - } - } - ], - "name": [ - { - "family": "Shepard", - "given": [ - "John" - ], - "use": "official" - } - ], - "birthDate": "2053-11-07", - "gender": "male", - "address": [ - { - "line": [ - "1234 Silversun Strip" - ], - "buildingNumber": "1234", - "city": "Boston", - "state": "Massachusetts", - "postalCode": "99999", - "use": "home" - } - ], - "telecom": [ - { - "use": "home", - "system": "phone", - "value": "123-456-7890" - } - ] - } - }, - { - "resource": { - "resourceType": "Patient", - "id": "a81bc81b-dead-4e5d-abff-90865d1e13b1", - "identifier": [ - { - "value": "7894561235", - "type": { - "coding": [ - { - "code": "MR", - "system": "http://terminology.hl7.org/CodeSystem/v2-0203", - "display": "Medical record number" - } - ] - } - } - ], - "name": [ - { - "family": "Shepard", - "given": [ - "John" - ], - "use": "official" - } - ], - "birthDate": "2053-11-07", - "gender": "female", - "address": [ - { - "line": [ - "PO Box 1", - 
"First Rock" - ], - "city": "Bozeman", - "state": "Montana", - "postalCode": "11111", - "use": "home" - } - ] - } - }, - { - "resource": { - "resourceType": "Patient", - "id": "a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11", - "identifier": [ - { - "value": "1234567890", - "type": { - "coding": [ - { - "code": "MR", - "system": "http://terminology.hl7.org/CodeSystem/v2-0203", - "display": "Medical record number" - } - ] - } - } - ], - "name": [ - { - "family": "Shepard", - "given": [ - "John" - ], - "use": "official" - } - ], - "birthDate": "2053-11-07", - "gender": "female", - "address": [ - { - "line": [ - "1234 Silversun Strip" - ], - "city": "Bozeman", - "state": "Montana", - "postalCode": "11111", - "use": "home" - } - ] - } - }, - { - "resource": { - "resourceType": "Patient", - "id": "a0eebc99-9c0b-4ef8-bb6d-6bb9bd380b22", - "identifier": [ - { - "value": "1234567890", - "type": { - "coding": [ - { - "code": "MR", - "system": "http://terminology.hl7.org/CodeSystem/v2-0203", - "display": "Medical record number" - } - ] - } - } - ], - "name": [ - { - "family": "Diop", - "given": [ - "Issa", - "Rae" - ], - "use": "official" - }, - { - "family": "Rae", - "given": [ - "Issa" - ], - "use": "official" - } - ], - "birthDate": "1985-01-12", - "gender": "female", - "address": [ - { - "line": [ - "1234 Insecure Road", - "Apt 2" - ], - "city": "Los Angeles", - "state": "California", - "postalCode": "90210", - "use": "home" - }, - { - "line": [ - "1234 Insecure Road", - "Apt 2" - ], - "city": "Los Angeles", - "state": "California", - "postalCode": "90210", - "use": "home" - } - ] - } - }, - { - "resource": { - "resourceType": "Patient", - "id": "a0eebc99-9c0b-4ef8-bb6d-6bb9bd380b23", - "identifier": [ - { - "value": "1234567890", - "type": { - "coding": [ - { - "code": "MR", - "system": "http://terminology.hl7.org/CodeSystem/v2-0203", - "display": "Medical record number" - } - ] - } - } - ], - "name": [ - { - "family": "Rae", - "given": [ - "Issa" - ], - "use": "official" - } - 
], - "birthDate": "1985-01-12", - "gender": "female", - "address": [ - { - "line": [ - "1234 Insecure Road", - "Apt 2" - ], - "city": "Los Angeles", - "state": "California", - "postalCode": "90210", - "use": "home" - } - ] - } - }, - { - "resource": { - "resourceType": "Patient", - "id": "a0eebc99-9c0b-4ef8-bb6d-6bb9bd380b24", - "name": [ - { - "family": "Rae", - "given": [ - "Issa" - ], - "use": "official" - } - ], - "birthDate": "1985-01-12", - "gender": "female", - "address": [ - { - "line": [ - "1234 Insecure Road", - "Apt 2" - ], - "city": "Los Angeles", - "state": "California", - "postalCode": "90210", - "use": "home" - } - ] - } - }, - { - "resource": { - "resourceType": "Patient", - "id": "a0eebc99-9c0b-4ef8-bb6d-6bb9bd380b25", - "name": [ - { - "family": "Rae", - "given": [ - "Issa" - ], - "use": "official" - } - ], - "birthDate": "1985-01-12", - "gender": "female", - "address": [ - { - "line": [ - "231 Main Street", - "Apt 567" - ], - "city": "Los Angeles", - "state": "California", - "postalCode": "90211", - "use": "home" - }, - { - "line": [ - "1234 Insecure Road", - "Apt 2" - ], - "city": "Los Angeles", - "state": "California", - "postalCode": "90210", - "use": "home" - } - ] - } - }, - { - "resource": { - "resourceType": "Patient", - "id": "a0eebc99-9c0b-4ef8-bb6d-6bb9bd380b26", - "identifier": [ - { - "value": "2345678901", - "type": { - "coding": [ - { - "code": "MR", - "system": "http://terminology.hl7.org/CodeSystem/v2-0203", - "display": "Medical record number" - } - ] - } - } - ], - "name": [ - { - "family": "Roberts", - "given": [ - "Chris" - ], - "use": "official" - } - ], - "birthDate": "2004-11-24", - "gender": "male", - "address": [ - { - "line": [ - "18 Broadway" - ], - "city": "Joneston", - "state": "Colorado", - "postalCode": "55555", - "use": "home" - } - ] - } - }, - { - "resource": { - "resourceType": "Patient", - "id": "a0eebc99-9c0b-4ef8-bb6d-6bb9bd380b73", - "identifier": [ - { - "value": "2345678901", - "type": { - "coding": [ - { 
- "code": "MR", - "system": "http://terminology.hl7.org/CodeSystem/v2-0203", - "display": "Medical record number" - } - ] - } - } - ], - "name": [ - { - "family": "Roberts", - "given": [ - "Christopher" - ], - "use": "official" - } - ], - "birthDate": "2004-11-24", - "gender": "male", - "address": [ - { - "line": [ - "1234 Insecure Road", - "Apt 2" - ], - "city": "Los Angeles", - "state": "California", - "postalCode": "90210", - "use": "home" - }, - { - "line": [ - "18 Broadway" - ], - "city": "Joneston", - "state": "Colorado", - "postalCode": "55555", - "use": "home" - } - ] - } - }, - { - "request": { - "method": "GET", - "url": "testing for entry with no resource" - } - } - ] -} diff --git a/pyproject.toml b/pyproject.toml index 1e55d132..dcb1e1de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,9 +39,9 @@ dev = [ "fastapi[standard]", "pytest>=8.3", "pytest-cov", + "pytest-env", "ruff", "mypy", - "alembic", "types-python-dateutil" ] prod = [ @@ -82,10 +82,13 @@ order-by-type = false [tool.pytest.ini_options] testpaths = ["tests/unit"] -pythonpath = ["src"] +pythonpath = ["src", "tests/unit"] filterwarnings = [ "ignore:typing.io is deprecated, import directly from typing instead:DeprecationWarning" ] +env = [ + "INITIAL_ALGORITHMS=", +] [tool.mypy] mypy_path = ["src"] diff --git a/scripts/local_server.sh b/scripts/local_server.sh index 25796c33..ea28dc2f 100755 --- a/scripts/local_server.sh +++ b/scripts/local_server.sh @@ -14,4 +14,4 @@ cd "$(dirname "$0")/.." 
PORT=${1:-8000} # Start the API server -uvicorn recordlinker.main:app --app-dir src --reload --host 0 --port ${PORT} --log-config src/recordlinker/log_config.yml +uvicorn recordlinker.main:app --app-dir src --reload --reload-dir src/ --host 0 --port ${PORT} --log-config src/recordlinker/log_config.yml diff --git a/src/recordlinker/config.py b/src/recordlinker/config.py index b3436cb6..608eb539 100644 --- a/src/recordlinker/config.py +++ b/src/recordlinker/config.py @@ -4,6 +4,14 @@ import pydantic_settings +class ConfigurationError(Exception): + """ + Error raised when there is a configuration issue. + """ + + pass + + class Settings(pydantic_settings.BaseSettings): model_config = pydantic_settings.SettingsConfigDict( env_file=".env", @@ -25,6 +33,14 @@ class Settings(pydantic_settings.BaseSettings): "above the connection pool size", default=10, ) + initial_algorithms: str = pydantic.Field( + description=( + "The path to the initial algorithms file that is loaded on startup if the " + "algorithms table is empty. This file should be in JSON format. If the " + "value is an empty string, no algorithms will be loaded." 
+ ), + default="assets/initial_algorithms.json", + ) settings = Settings() # type: ignore diff --git a/src/recordlinker/linking/algorithm_service.py b/src/recordlinker/linking/algorithm_service.py index da29e35d..ae014ab6 100644 --- a/src/recordlinker/linking/algorithm_service.py +++ b/src/recordlinker/linking/algorithm_service.py @@ -74,7 +74,7 @@ def load_algorithm( algo = data.model_dump() passes = algo.pop("passes") # use the existing Algorithm or create a new one - created = not obj + created = obj is None obj = obj or models.Algorithm() # Create and add the Algorithm for key, value in algo.items(): diff --git a/src/recordlinker/linking/link.py b/src/recordlinker/linking/link.py index 0409f43c..5356924b 100644 --- a/src/recordlinker/linking/link.py +++ b/src/recordlinker/linking/link.py @@ -99,10 +99,11 @@ def compare( results: list[float] = [] for field, func in funcs.items(): # TODO: can we do this check earlier? - if field not in {i.value for i in schemas.Feature}: + feature = getattr(schemas.Feature, field, None) + if feature is None: raise ValueError(f"Invalid comparison field: {field}") # Evaluate the comparison function and append the result to the list - result: float = func(record, patient, schemas.Feature(field), **kwargs) # type: ignore + result: float = func(record, patient, feature, **kwargs) # type: ignore results.append(result) return matching_rule(results, **kwargs) # type: ignore diff --git a/src/recordlinker/log_config.yml b/src/recordlinker/log_config.yml index 20efafa1..6f1bf1e2 100644 --- a/src/recordlinker/log_config.yml +++ b/src/recordlinker/log_config.yml @@ -28,7 +28,7 @@ loggers: - access propagate: no root: - level: DEBUG + level: INFO handlers: - default propagate: no diff --git a/src/recordlinker/main.py b/src/recordlinker/main.py index 25d94652..3d006544 100644 --- a/src/recordlinker/main.py +++ b/src/recordlinker/main.py @@ -139,8 +139,9 @@ async def health_check(db_session: orm.Session = Depends(get_session)) -> Health # Sample 
requests and responses for docs -sample_link_record_requests = utils.read_json_from_assets("sample_link_record_requests.json") -sample_link_record_responses = utils.read_json_from_assets("sample_link_record_responses.json") +# TODO: These assets need to be installed with the python code +sample_link_record_requests = utils.read_json("assets", "sample_link_record_requests.json") +sample_link_record_responses = utils.read_json("assets", "sample_link_record_responses.json") @app.post("/link-record", status_code=200, responses={200: sample_link_record_responses}) diff --git a/src/recordlinker/models/algorithm.py b/src/recordlinker/models/algorithm.py index 4154c24a..d9c5063a 100644 --- a/src/recordlinker/models/algorithm.py +++ b/src/recordlinker/models/algorithm.py @@ -1,3 +1,4 @@ +import logging import typing from sqlalchemy import event @@ -6,9 +7,13 @@ from sqlalchemy import types as sqltypes from recordlinker import utils +from recordlinker.config import ConfigurationError +from recordlinker.config import settings from .base import Base +LOGGER = logging.getLogger(__name__) + class Algorithm(Base): __tablename__ = "algorithm" @@ -21,6 +26,20 @@ class Algorithm(Base): back_populates="algorithm", cascade="all, delete-orphan" ) + @classmethod + def from_dict(cls, **data: dict) -> "Algorithm": + """ + Create an instance of Algorithm from a dictionary. + + Parameters: + data: The dictionary containing the data for the Algorithm instance. + + Returns: + The Algorithm instance. 
+ """ + passes = [AlgorithmPass(**p) for p in data.pop("passes", [])] + return cls(passes=passes, **data) + def check_only_one_default(mapping, connection, target): """ @@ -112,3 +131,35 @@ def bound_rule(self) -> typing.Callable: if not hasattr(self, "_bound_rule"): self._bound_rule = utils.str_to_callable(self.rule) return self._bound_rule + + +@event.listens_for(schema.MetaData, "after_create") +def create_initial_algorithms(target, connection, **kw) -> typing.List[Algorithm] | None: + """ + Create the initial algorithms if they have been defined in the configuration. + This function is called after the database schema has been created in the + recordlinker.database.create_sessionmaker function. + """ + if settings.initial_algorithms: + try: + data = utils.read_json(settings.initial_algorithms) + except Exception as exc: + raise ConfigurationError("Error loading initial algorithms") from exc + if not any(algo.get("is_default") for algo in data): + raise ConfigurationError(f"No default algorithm found in {settings.initial_algorithms}") + + session = orm.Session(bind=connection) + try: + # Only load the algorithms if there are none in the database + if session.query(Algorithm).count() == 0: + objs = [Algorithm.from_dict(**algo) for algo in data] + session.add_all(objs) + session.commit() + LOGGER.info(f"Created {len(objs)} initial algorithms.") + return objs + except Exception as exc: + session.rollback() + raise ConfigurationError("Error creating initial algorithms") from exc + finally: + session.close() + return None diff --git a/src/recordlinker/schemas/pii.py b/src/recordlinker/schemas/pii.py index c2f26397..4397f347 100644 --- a/src/recordlinker/schemas/pii.py +++ b/src/recordlinker/schemas/pii.py @@ -13,15 +13,15 @@ class Feature(enum.Enum): Enum for the different Patient attributes that can be used for comparison. 
""" - BIRTHDATE = "birthdate" - MRN = "mrn" - SEX = "sex" - FIRST_NAME = "first_name" - LAST_NAME = "last_name" - ADDRESS = "address" - CITY = "city" - STATE = "state" - ZIP = "zip" + BIRTHDATE = "BIRTHDATE" + MRN = "MRN" + SEX = "SEX" + FIRST_NAME = "FIRST_NAME" + LAST_NAME = "LAST_NAME" + ADDRESS = "ADDRESS" + CITY = "CITY" + STATE = "STATE" + ZIP = "ZIP" def __str__(self): """ diff --git a/src/recordlinker/utils.py b/src/recordlinker/utils.py index d27b1489..fa118dfb 100644 --- a/src/recordlinker/utils.py +++ b/src/recordlinker/utils.py @@ -22,12 +22,13 @@ def project_root() -> pathlib.Path: return root -def read_json_from_assets(*filepaths: str) -> dict: +def read_json(*filepaths: str) -> dict: """ - Loads a JSON file from the 'assets' directory. + Loads a JSON file. """ - filename = pathlib.Path(project_root(), "assets", *filepaths) - return json.load(open(filename)) + filename = pathlib.Path(project_root(), *filepaths) + with open(filename, "r") as fobj: + return json.load(fobj) def bind_functions(data: dict) -> dict: diff --git a/tests/unit/assets/initial_algorithms.json b/tests/unit/assets/initial_algorithms.json new file mode 120000 index 00000000..6c335b1e --- /dev/null +++ b/tests/unit/assets/initial_algorithms.json @@ -0,0 +1 @@ +../../../assets/initial_algorithms.json \ No newline at end of file diff --git a/assets/general/patient_bundle.json b/tests/unit/assets/patient_bundle.json similarity index 100% rename from assets/general/patient_bundle.json rename to tests/unit/assets/patient_bundle.json diff --git a/assets/linking/patient_bundle_to_link_with_mpi.json b/tests/unit/assets/simple_patient_bundle_to_link_with_mpi.json similarity index 100% rename from assets/linking/patient_bundle_to_link_with_mpi.json rename to tests/unit/assets/simple_patient_bundle_to_link_with_mpi.json diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index fd3ed035..ab052e4d 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -1,3 +1,7 @@ +import 
functools +import json +import pathlib + import pytest from fastapi.testclient import TestClient @@ -6,6 +10,16 @@ from recordlinker import models +def load_json_asset(*paths: str) -> dict | list: + """ + Loads a JSON file from the testing 'assets' directory. + """ + cwd = pathlib.Path(__file__).resolve().parent + filename = pathlib.Path(cwd, "assets", *paths) + with open(filename, "r") as fobj: + return json.load(fobj) + + @pytest.fixture(scope="function") def session(): with database.get_test_session() as session: @@ -28,133 +42,17 @@ def client(): yield c +@functools.lru_cache @pytest.fixture def basic_algorithm(): - basic_algo_pass1 = models.AlgorithmPass( - id=1, - algorithm_id=1, - blocking_keys=["BIRTHDATE", "MRN", "SEX"], - evaluators={ - "first_name": "func:recordlinker.linking.matchers.feature_match_fuzzy_string", - "last_name": "func:recordlinker.linking.matchers.feature_match_exact", - }, - rule="func:recordlinker.linking.matchers.eval_perfect_match", - cluster_ratio=0.9, - kwargs={ - "thresholds": { - "first_name": 0.9, - "last_name": 0.9, - "birthdate": 0.95, - "address": 0.9, - "city": 0.92, - "zip": 0.95, - } - }, - ) - basic_algo_pass2 = models.AlgorithmPass( - id=2, - algorithm_id=1, - blocking_keys=["ZIP", "FIRST_NAME", "LAST_NAME", "SEX"], - evaluators={ - "address": "func:recordlinker.linking.matchers.feature_match_fuzzy_string", - "birthdate": "func:recordlinker.linking.matchers.feature_match_exact", - }, - rule="func:recordlinker.linking.matchers.eval_perfect_match", - cluster_ratio=0.9, - kwargs={ - "thresholds": { - "first_name": 0.9, - "last_name": 0.9, - "birthdate": 0.95, - "address": 0.9, - "city": 0.92, - "zip": 0.95, - } - }, - ) - return models.Algorithm( - id=1, - label="dibbs-basic", - is_default=True, - description="First algorithm", - passes=[basic_algo_pass1, basic_algo_pass2], - ) + for algo in load_json_asset("initial_algorithms.json"): + if algo["label"] == "dibbs-basic": + return models.Algorithm.from_dict(**algo) 
+@functools.lru_cache @pytest.fixture def enhanced_algorithm(): - enhanced_algo_pass1 = models.AlgorithmPass( - id=1, - algorithm_id=1, - blocking_keys=["BIRTHDATE", "MRN", "SEX"], - evaluators={ - "first_name": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare", - "last_name": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare", - }, - rule="func:recordlinker.linking.matchers.eval_log_odds_cutoff", - cluster_ratio=0.9, - kwargs={ - "similarity_measure": "JaroWinkler", - "thresholds": { - "first_name": 0.9, - "last_name": 0.9, - "birthdate": 0.95, - "address": 0.9, - "city": 0.92, - "zip": 0.95, - }, - "true_match_threshold": 12.2, - "log_odds": { - "address": 8.438284928858774, - "birthdate": 10.126641103800338, - "city": 2.438553006137189, - "first_name": 6.849475906891162, - "last_name": 6.350720397426025, - "mrn": 0.3051262572525359, - "sex": 0.7510419059643679, - "state": 0.022376768992488694, - "zip": 4.975031471124867, - }, - }, - ) - enhanced_algo_pass2 = models.AlgorithmPass( - id=2, - algorithm_id=1, - blocking_keys=["ZIP", "FIRST_NAME", "LAST_NAME", "SEX"], - evaluators={ - "address": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare", - "birthdate": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare", - }, - rule="func:recordlinker.linking.matchers.eval_log_odds_cutoff", - cluster_ratio=0.9, - kwargs={ - "similarity_measure": "JaroWinkler", - "thresholds": { - "first_name": 0.9, - "last_name": 0.9, - "birthdate": 0.95, - "address": 0.9, - "city": 0.92, - "zip": 0.95, - }, - "true_match_threshold": 17.0, - "log_odds": { - "address": 8.438284928858774, - "birthdate": 10.126641103800338, - "city": 2.438553006137189, - "first_name": 6.849475906891162, - "last_name": 6.350720397426025, - "mrn": 0.3051262572525359, - "sex": 0.7510419059643679, - "state": 0.022376768992488694, - "zip": 4.975031471124867, - }, - }, - ) - return models.Algorithm( - id=1, - 
label="dibbs-enhanced", - is_default=False, - description="First algorithm", - passes=[enhanced_algo_pass1, enhanced_algo_pass2], - ) + for algo in load_json_asset("initial_algorithms.json"): + if algo["label"] == "dibbs-enhanced": + return models.Algorithm.from_dict(**algo) diff --git a/tests/unit/linking/test_link.py b/tests/unit/linking/test_link.py index 0385e0a2..6782c077 100644 --- a/tests/unit/linking/test_link.py +++ b/tests/unit/linking/test_link.py @@ -10,16 +10,16 @@ import uuid import pytest +from conftest import load_json_asset from recordlinker import models from recordlinker import schemas -from recordlinker import utils from recordlinker.linking import link class TestAddPersonResource: def test_add_person_resource(self): - bundle = utils.read_json_from_assets("general", "patient_bundle.json") + bundle = load_json_asset("patient_bundle.json") raw_bundle = copy.deepcopy(bundle) patient_id = "TEST_PATIENT_ID" person_id = "TEST_PERSON_ID" @@ -32,13 +32,8 @@ def test_add_person_resource(self): assert len(returned_bundle.get("entry")) == len(bundle.get("entry")) + 1 # Assert the added element is the person_resource bundle - assert ( - returned_bundle.get("entry")[-1].get("resource").get("resourceType") == "Person" - ) - assert ( - returned_bundle.get("entry")[-1].get("request").get("url") - == "Person/TEST_PERSON_ID" - ) + assert returned_bundle.get("entry")[-1].get("resource").get("resourceType") == "Person" + assert returned_bundle.get("entry")[-1].get("request").get("url") == "Person/TEST_PERSON_ID" class TestCompare: @@ -68,7 +63,18 @@ def test_compare_match(self): } ) - algorithm_pass = models.AlgorithmPass(id=1, algorithm_id=1, blocking_keys=[1], evaluators={"first_name": "func:recordlinker.linking.matchers.feature_match_exact", "last_name": "func:recordlinker.linking.matchers.feature_match_fuzzy_string"}, rule="func:recordlinker.linking.matchers.eval_perfect_match", cluster_ratio=1.0, kwargs={}) + algorithm_pass = models.AlgorithmPass( + id=1, + 
algorithm_id=1, + blocking_keys=[1], + evaluators={ + "FIRST_NAME": "func:recordlinker.linking.matchers.feature_match_exact", + "LAST_NAME": "func:recordlinker.linking.matchers.feature_match_fuzzy_string", + }, + rule="func:recordlinker.linking.matchers.eval_perfect_match", + cluster_ratio=1.0, + kwargs={}, + ) assert link.compare(rec, pat, algorithm_pass) is True @@ -97,7 +103,18 @@ def test_compare_no_match(self): ] } ) - algorithm_pass = models.AlgorithmPass(id=1, algorithm_id=1, blocking_keys=[1], evaluators={"first_name": "func:recordlinker.linking.matchers.feature_match_exact", "last_name": "func:recordlinker.linking.matchers.feature_match_exact"}, rule="func:recordlinker.linking.matchers.eval_perfect_match", cluster_ratio=1.0, kwargs={}) + algorithm_pass = models.AlgorithmPass( + id=1, + algorithm_id=1, + blocking_keys=[1], + evaluators={ + "FIRST_NAME": "func:recordlinker.linking.matchers.feature_match_exact", + "LAST_NAME": "func:recordlinker.linking.matchers.feature_match_exact", + }, + rule="func:recordlinker.linking.matchers.eval_perfect_match", + cluster_ratio=1.0, + kwargs={}, + ) assert link.compare(rec, pat, algorithm_pass) is False @@ -105,7 +122,8 @@ def test_compare_no_match(self): class TestLinkRecordAgainstMpi: @pytest.fixture def patients(self): - bundle = utils.read_json_from_assets("linking", "patient_bundle_to_link_with_mpi.json") + bundle = load_json_asset("simple_patient_bundle_to_link_with_mpi.json") + patients = [] patients: list[schemas.PIIRecord] = [] for entry in bundle["entry"]: if entry.get("resource", {}).get("resourceType", {}) == "Patient": @@ -169,4 +187,4 @@ def test_enhanced_match_three(self, session, enhanced_algorithm, patients: list[ # in second pass name blocks on different cluster and address matches it, # finds greatest strength match and correctly assigns to larger cluster assert matches == [False, True, False, True, False, False, True] - assert sorted(list(mapped_patients.values())) == [1, 1, 1, 4] \ No newline at end 
of file + assert sorted(list(mapped_patients.values())) == [1, 1, 1, 4] diff --git a/tests/unit/models/test_algorithm.py b/tests/unit/models/test_algorithm.py index 015b556e..f8f8f935 100644 --- a/tests/unit/models/test_algorithm.py +++ b/tests/unit/models/test_algorithm.py @@ -4,11 +4,13 @@ This module contains the unit tests for the recordlinker.models.algorithm module. """ +import unittest.mock import pytest -from recordlinker import models +from recordlinker import config from recordlinker.linking import matchers +from recordlinker.models import algorithm as models class TestAlgorithm: @@ -66,13 +68,53 @@ def test_update_existing_default(self, session): # should not raise any value errors session.commit() + def test_from_dict_no_passes(self): + data = { + "label": "Algorithm 1", + "description": "First algorithm", + } + algo = models.Algorithm.from_dict(**data) + assert algo.label == "Algorithm 1" + assert algo.description == "First algorithm" + assert algo.passes == [] + + def test_from_dict_with_passes(self): + data = { + "label": "Algorithm 1", + "description": "First algorithm", + "passes": [ + { + "blocking_keys": ["ZIP"], + "evaluators": { + "FIRST_NAME": "func:recordlinker.linking.matchers.feature_match_exact", + "LAST_NAME": "func:recordlinker.linking.matchers.feature_match_exact", + }, + "rule": "func:recordlinker.linking.matchers.eval_perfect_match", + "cluster_ratio": 1.0, + } + ], + } + algo = models.Algorithm.from_dict(**data) + assert algo.label == "Algorithm 1" + assert algo.description == "First algorithm" + assert len(algo.passes) == 1 + assert algo.passes[0].blocking_keys == ["ZIP"] + assert algo.passes[0].evaluators == { + "FIRST_NAME": "func:recordlinker.linking.matchers.feature_match_exact", + "LAST_NAME": "func:recordlinker.linking.matchers.feature_match_exact", + } + assert algo.passes[0].rule == "func:recordlinker.linking.matchers.eval_perfect_match" + assert algo.passes[0].cluster_ratio == 1.0 + class TestAlgorithmPass: def 
test_bound_evaluators(self): """ Tests that the bound_evaluators method returns the correct functions """ - ap = models.AlgorithmPass(evaluators={"BIRTHDATE": "func:recordlinker.linking.matchers.feature_match_any"}) + ap = models.AlgorithmPass( + evaluators={"BIRTHDATE": "func:recordlinker.linking.matchers.feature_match_any"} + ) assert ap.bound_evaluators() == {"BIRTHDATE": matchers.feature_match_any} ap.evaluators = {"BIRTHDATE": "func:recordlinker.linking.matchers.feature_match_exact"} assert ap.bound_evaluators() == {"BIRTHDATE": matchers.feature_match_exact} @@ -91,3 +133,45 @@ def test_bound_rule(self): ap.rule = "func:recordlinker.linking.matchers.invalid" with pytest.raises(ValueError, match="Failed to convert string to callable"): ap.bound_rule() + + +class TestCreateInitialAlgorithms: + def test_invalid_file(self, monkeypatch, session): + """ + Tests that an invalid file raises a ConfigurationError + """ + monkeypatch.setattr(config.settings, "initial_algorithms", "invalid_file.json") + with pytest.raises(config.ConfigurationError, match="Error loading initial algorithms"): + models.create_initial_algorithms(None, session.connection()) + + def test_no_default(self, monkeypatch, session): + """ + Tests that initial algorithms without a default algorithm raise a ConfigurationError + """ + monkeypatch.setattr(config.settings, "initial_algorithms", "file.json") + with unittest.mock.patch("recordlinker.utils.read_json") as read_json: + read_json.return_value = [{"is_default": False}] + with pytest.raises(config.ConfigurationError, match="No default algorithm found"): + models.create_initial_algorithms(None, session.connection()) + + def test_invalid_algorithm(self, monkeypatch, session): + """ + Tests that an invalid algorithm raises a ConfigurationError + """ + monkeypatch.setattr(config.settings, "initial_algorithms", "file.json") + with unittest.mock.patch("recordlinker.utils.read_json") as read_json: + read_json.return_value = [{"labell": "Algorithm 1", "is_default": True}] +
with pytest.raises(config.ConfigurationError, match="Error creating initial algorithms"): + models.create_initial_algorithms(None, session.connection()) + + def test_create_initial_algorithms(self, monkeypatch, session): + """ + Tests that the initial algorithms are created + """ + monkeypatch.setattr(config.settings, "initial_algorithms", "file.json") + with unittest.mock.patch("recordlinker.utils.read_json") as read_json: + read_json.return_value = [{"label": "Algorithm 1", "is_default": True}] + models.create_initial_algorithms(None, session.connection()) + assert session.query(models.Algorithm).count() == 1 + assert session.query(models.Algorithm).first().is_default is True + assert session.query(models.Algorithm).first().label == "Algorithm 1" diff --git a/tests/unit/test_main.py b/tests/unit/test_main.py index 26ae960f..55088919 100644 --- a/tests/unit/test_main.py +++ b/tests/unit/test_main.py @@ -1,19 +1,23 @@ import copy import json -import pytest from unittest import mock +import pytest from fastapi import status -from recordlinker import utils from recordlinker import schemas +from recordlinker import utils from recordlinker.linking import link +from conftest import load_json_asset + + def test_health_check(client): actual_response = client.get("/") assert actual_response.status_code == 200 assert actual_response.json() == {"status": "OK"} + def test_openapi(client): actual_response = client.get("/openapi.json") assert actual_response.status_code == 200 @@ -39,7 +43,7 @@ def test_linkrecord_bundle_with_no_patient(self, patched_subprocess, basic_algor @mock.patch("recordlinker.linking.algorithm_service.default_algorithm") def test_linkrecord_success(self, patched_subprocess, basic_algorithm, client): patched_subprocess.return_value = basic_algorithm - test_bundle = utils.read_json_from_assets("patient_bundle_to_link_with_mpi.json") + test_bundle = load_json_asset("patient_bundle_to_link_with_mpi.json") entry_list = copy.deepcopy(test_bundle["entry"]) 
bundle_1 = test_bundle @@ -97,12 +101,14 @@ def test_linkrecord_success(self, patched_subprocess, basic_algorithm, client): @mock.patch("recordlinker.linking.algorithm_service.get_algorithm") def test_linkrecord_enhanced_algo(self, patched_subprocess, enhanced_algorithm, client): patched_subprocess.return_value = enhanced_algorithm - test_bundle = utils.read_json_from_assets("patient_bundle_to_link_with_mpi.json") + test_bundle = load_json_asset("patient_bundle_to_link_with_mpi.json") entry_list = copy.deepcopy(test_bundle["entry"]) bundle_1 = test_bundle bundle_1["entry"] = [entry_list[0]] - resp_1 = client.post("/link-record", json={"bundle": bundle_1, "algorithm": "dibbs-enhanced"}) + resp_1 = client.post( + "/link-record", json={"bundle": bundle_1, "algorithm": "dibbs-enhanced"} + ) new_bundle = resp_1.json()["updated_bundle"] person_1 = [ r.get("resource") @@ -113,7 +119,9 @@ def test_linkrecord_enhanced_algo(self, patched_subprocess, enhanced_algorithm, bundle_2 = test_bundle bundle_2["entry"] = [entry_list[1]] - resp_2 = client.post("/link-record", json={"bundle": bundle_2, "algorithm": "dibbs-enhanced"}) + resp_2 = client.post( + "/link-record", json={"bundle": bundle_2, "algorithm": "dibbs-enhanced"} + ) new_bundle = resp_2.json()["updated_bundle"] person_2 = [ r.get("resource") @@ -125,12 +133,16 @@ def test_linkrecord_enhanced_algo(self, patched_subprocess, enhanced_algorithm, bundle_3 = test_bundle bundle_3["entry"] = [entry_list[2]] - resp_3 = client.post("/link-record", json={"bundle": bundle_3, "algorithm": "dibbs-enhanced"}) + resp_3 = client.post( + "/link-record", json={"bundle": bundle_3, "algorithm": "dibbs-enhanced"} + ) assert not resp_3.json()["found_match"] bundle_4 = test_bundle bundle_4["entry"] = [entry_list[3]] - resp_4 = client.post("/link-record", json={"bundle": bundle_4, "algorithm": "dibbs-enhanced"}) + resp_4 = client.post( + "/link-record", json={"bundle": bundle_4, "algorithm": "dibbs-enhanced"} + ) new_bundle = 
resp_4.json()["updated_bundle"] person_4 = [ r.get("resource") @@ -142,20 +154,23 @@ def test_linkrecord_enhanced_algo(self, patched_subprocess, enhanced_algorithm, bundle_5 = test_bundle bundle_5["entry"] = [entry_list[4]] - resp_5 = client.post("/link-record", json={"bundle": bundle_5, "algorithm": "dibbs-enhanced"}) + resp_5 = client.post( + "/link-record", json={"bundle": bundle_5, "algorithm": "dibbs-enhanced"} + ) assert not resp_5.json()["found_match"] bundle_6 = test_bundle bundle_6["entry"] = [entry_list[5]] - resp_6 = client.post("/link-record", json={"bundle": bundle_6, "algorithm": "dibbs-enhanced"}) + resp_6 = client.post( + "/link-record", json={"bundle": bundle_6, "algorithm": "dibbs-enhanced"} + ) new_bundle = resp_6.json()["updated_bundle"] assert not resp_6.json()["found_match"] - @mock.patch("recordlinker.linking.algorithm_service.get_algorithm") def test_linkrecord_invalid_algorithm_param(self, patched_subprocess, client): patched_subprocess.return_value = None - test_bundle = utils.read_json_from_assets("patient_bundle_to_link_with_mpi.json") + test_bundle = load_json_asset("patient_bundle_to_link_with_mpi.json") expected_response = { "found_match": False, "updated_bundle": test_bundle, @@ -169,10 +184,11 @@ def test_linkrecord_invalid_algorithm_param(self, patched_subprocess, client): assert actual_response.json() == expected_response assert actual_response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY + class TestLink: @pytest.fixture def patients(self): - bundle = utils.read_json_from_assets("linking", "patient_bundle_to_link_with_mpi.json") + bundle = load_json_asset("simple_patient_bundle_to_link_with_mpi.json") patients: list[schemas.PIIRecord] = [] for entry in bundle["entry"]: if entry.get("resource", {}).get("resourceType", {}) == "Patient": @@ -183,63 +199,117 @@ def patients(self): def test_link_success(self, patched_subprocess, basic_algorithm, patients, client): patched_subprocess.return_value = basic_algorithm - response_1 = 
client.post("/link", json={"record": json.loads(patients[0].model_dump_json(exclude_none=True))}) + response_1 = client.post( + "/link", json={"record": json.loads(patients[0].model_dump_json(exclude_none=True))} + ) person_1 = response_1.json()["person_reference_id"] assert not response_1.json()["is_match"] - response_2 = client.post("/link", json={"record": json.loads(patients[1].model_dump_json(exclude_none=True))}) + response_2 = client.post( + "/link", json={"record": json.loads(patients[1].model_dump_json(exclude_none=True))} + ) person_2 = response_2.json()["person_reference_id"] assert response_2.json()["is_match"] assert person_2 == person_1 - - response_3 = client.post("/link", json={"record": json.loads(patients[2].model_dump_json(exclude_none=True))}) + + response_3 = client.post( + "/link", json={"record": json.loads(patients[2].model_dump_json(exclude_none=True))} + ) assert not response_3.json()["is_match"] # Cluster membership success--justified match - response_4 = client.post("/link", json={"record": json.loads(patients[3].model_dump_json(exclude_none=True))}) + response_4 = client.post( + "/link", json={"record": json.loads(patients[3].model_dump_json(exclude_none=True))} + ) person_4 = response_4.json()["person_reference_id"] assert response_4.json()["is_match"] assert person_4 == person_1 - response_5 = client.post("/link", json={"record": json.loads(patients[4].model_dump_json(exclude_none=True))}) + response_5 = client.post( + "/link", json={"record": json.loads(patients[4].model_dump_json(exclude_none=True))} + ) assert not response_5.json()["is_match"] - response_6 = client.post("/link", json={"record": json.loads(patients[5].model_dump_json(exclude_none=True))}) + response_6 = client.post( + "/link", json={"record": json.loads(patients[5].model_dump_json(exclude_none=True))} + ) assert not response_6.json()["is_match"] @mock.patch("recordlinker.linking.algorithm_service.get_algorithm") - def test_link_enhanced_algorithm(self, 
patched_subprocess, enhanced_algorithm, patients, client): + def test_link_enhanced_algorithm( + self, patched_subprocess, enhanced_algorithm, patients, client + ): patched_subprocess.return_value = enhanced_algorithm - response_1 = client.post("/link", json={"record": json.loads(patients[0].model_dump_json(exclude_none=True)), "algorithm": "dibbs-enhanced"}) + response_1 = client.post( + "/link", + json={ + "record": json.loads(patients[0].model_dump_json(exclude_none=True)), + "algorithm": "dibbs-enhanced", + }, + ) person_1 = response_1.json()["person_reference_id"] assert not response_1.json()["is_match"] - response_2 = client.post("/link", json={"record": json.loads(patients[1].model_dump_json(exclude_none=True)), "algorithm": "dibbs-enhanced"}) + response_2 = client.post( + "/link", + json={ + "record": json.loads(patients[1].model_dump_json(exclude_none=True)), + "algorithm": "dibbs-enhanced", + }, + ) person_2 = response_2.json()["person_reference_id"] assert response_2.json()["is_match"] assert person_2 == person_1 - - response_3 = client.post("/link", json={"record": json.loads(patients[2].model_dump_json(exclude_none=True)), "algorithm": "dibbs-enhanced"}) + + response_3 = client.post( + "/link", + json={ + "record": json.loads(patients[2].model_dump_json(exclude_none=True)), + "algorithm": "dibbs-enhanced", + }, + ) assert not response_3.json()["is_match"] # Cluster membership success--justified match - response_4 = client.post("/link", json={"record": json.loads(patients[3].model_dump_json(exclude_none=True)), "algorithm": "dibbs-enhanced"}) + response_4 = client.post( + "/link", + json={ + "record": json.loads(patients[3].model_dump_json(exclude_none=True)), + "algorithm": "dibbs-enhanced", + }, + ) person_4 = response_4.json()["person_reference_id"] assert response_4.json()["is_match"] assert person_4 == person_1 - response_5 = client.post("/link", json={"record": json.loads(patients[4].model_dump_json(exclude_none=True)), "algorithm": 
"dibbs-enhanced"}) + response_5 = client.post( + "/link", + json={ + "record": json.loads(patients[4].model_dump_json(exclude_none=True)), + "algorithm": "dibbs-enhanced", + }, + ) assert not response_5.json()["is_match"] - response_6 = client.post("/link", json={"record": json.loads(patients[5].model_dump_json(exclude_none=True)), "algorithm": "dibbs-enhanced"}) + response_6 = client.post( + "/link", + json={ + "record": json.loads(patients[5].model_dump_json(exclude_none=True)), + "algorithm": "dibbs-enhanced", + }, + ) assert not response_6.json()["is_match"] @mock.patch("recordlinker.linking.algorithm_service.get_algorithm") def test_link_invalid_algorithm_param(self, patched_subprocess, patients, client): patched_subprocess.return_value = None actual_response = client.post( - "/link", json={"record": json.loads(patients[0].model_dump_json(exclude_none=True)), "algorithm": "INVALID"} + "/link", + json={ + "record": json.loads(patients[0].model_dump_json(exclude_none=True)), + "algorithm": "INVALID", + }, ) assert actual_response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY