
Commit

add cache to feature matrix
xu-hao committed Jul 28, 2020
1 parent 97b1ab2 commit b220cb8
Showing 1 changed file with 101 additions and 64 deletions.
165 changes: 101 additions & 64 deletions features/model.py
@@ -1,4 +1,4 @@
-from sqlalchemy import Table, Column, Integer, String, MetaData, func, Sequence, between, Index, text, case, and_
+from sqlalchemy import Table, Column, Integer, String, MetaData, func, Sequence, between, Index, text, case, and_, DateTime
 from sqlalchemy.sql import select, func
 from scipy.stats import chi2_contingency
 import json
@@ -10,6 +10,7 @@
 from .features import features, lookUpFeatureClass, features_dict
 from tx.functional.maybe import Nothing, Just
 import logging
+from datetime import datetime, timezone
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -43,6 +44,20 @@ def table_id(table):
 
 cohort_id_seq = Sequence('cohort_id_seq', metadata=metadata)
 
+association_cols = [
+    Column("table", String),
+    Column("year", Integer),
+    Column("cohort_features", String),
+    Column("feature_a", String),
+    Column("feature_b", String),
+    Column("association", String),
+    Column("access_time", DateTime)
+]
+
+cache = Table("cache", metadata, *association_cols)
+
+Index("cache_index", cache.c.table, cache.c.year, cache.c.cohort_features, cache.c.feature_a, cache.c.feature_b)
+
 def op_dict(table, k, v):
     return {
         ">": lambda: table.c[k] > v["value"],
@@ -202,76 +217,98 @@ def timed(*args, **kw):
 
 @timeit
 def select_feature_matrix(conn, table_name, year, cohort_features, feature_a, feature_b):
-    table = tables[table_name]
-
-    ka = feature_a["feature_name"]
-    vas = feature_a["feature_qualifiers"]
-    kb = feature_b["feature_name"]
-    vbs = feature_b["feature_qualifiers"]
+    cohort_features_json = json.dumps(cohort_features, sort_keys=True)
+    feature_a_json = json.dumps(feature_a, sort_keys=True)
+    feature_b_json = json.dumps(feature_b, sort_keys=True)
 
-    selections = [
-        case_select2(table, kb, vb, ka, va) for vb, va in product(vbs, vas)
-    ] + [
-        case_select(table, ka, va) for va in vas
-    ] + [
-        case_select(table, kb, vb) for vb in vbs
-    ] + [
-        func.count()
-    ]
-
-    result = []
-    for i in range(0, len(selections), MAX_ENTRIES_PER_ROW):
-        subs = selections[i:min(i+MAX_ENTRIES_PER_ROW, len(selections))]
-
-        s = select(subs).select_from(table)
-        if year is not None:
-            s = s.where(table.c.year == year)
-        for k, v in cohort_features.items():
-            s = filter_select(s, table, k, v)
-
-        result.extend(list(conn.execute(s).first()))
-
-
-    nvas = len(vas)
-    nvbs = len(vbs)
-    mat_size = nvas * nvbs
-
-    feature_matrix = np.reshape(result[:mat_size], (nvbs, nvas)).tolist()
+    result = conn.execute(select([cache.c.association]).select_from(cache).where(cache.c.table == table_name).where(cache.c.year == year).where(cache.c.cohort_features == cohort_features_json).where(cache.c.feature_a == feature_a_json).where(cache.c.feature_b == feature_b_json)).first()
+
 
-
-    total_cols = result[mat_size : mat_size + nvas]
-    total_rows = result[mat_size + nvas : mat_size + nvas + nvbs]
-
-    total = result[mat_size + nvas + nvbs]
+    timestamp = datetime.now(timezone.utc)
+
+    if result is None:
+
+        table = tables[table_name]
+
+        ka = feature_a["feature_name"]
+        vas = feature_a["feature_qualifiers"]
+        kb = feature_b["feature_name"]
+        vbs = feature_b["feature_qualifiers"]
+
+        selections = [
+            case_select2(table, kb, vb, ka, va) for vb, va in product(vbs, vas)
+        ] + [
+            case_select(table, ka, va) for va in vas
+        ] + [
+            case_select(table, kb, vb) for vb in vbs
+        ] + [
+            func.count()
+        ]
+
+        result = []
+        for i in range(0, len(selections), MAX_ENTRIES_PER_ROW):
+            subs = selections[i:min(i+MAX_ENTRIES_PER_ROW, len(selections))]
+
+            s = select(subs).select_from(table)
+            if year is not None:
+                s = s.where(table.c.year == year)
+            for k, v in cohort_features.items():
+                s = filter_select(s, table, k, v)
+
+            result.extend(list(conn.execute(s).first()))
+
+
+        nvas = len(vas)
+        nvbs = len(vbs)
+        mat_size = nvas * nvbs
+
+        feature_matrix = np.reshape(result[:mat_size], (nvbs, nvas)).tolist()
+
+
+        total_cols = result[mat_size : mat_size + nvas]
+        total_rows = result[mat_size + nvas : mat_size + nvas + nvbs]
+
+        total = result[mat_size + nvas + nvbs]
+
+        chi_squared, p, *_ = chi2_contingency(list(map(lambda x : list(map(add_eps, x)), feature_matrix)), correction=False)
+
+        feature_matrix2 = [
+            [
+                {
+                    "frequency": cell,
+                    "row_percentage": div(cell, total_rows[i]),
+                    "column_percentage": div(cell, total_cols[j]),
+                    "total_percentage": div(cell, total)
+                } for j, cell in enumerate(row)
+            ] for i, row in enumerate(feature_matrix)
+        ]
+
+        feature_a = feature_a.copy()
+        feature_b = feature_b.copy()
+        feature_a["biolink_class"] = inflection.underscore(lookUpFeatureClass(table_name, ka))
+        feature_b["biolink_class"] = inflection.underscore(lookUpFeatureClass(table_name, kb))
+
+        association = {
+            "feature_a": feature_a,
+            "feature_b": feature_b,
+            "feature_matrix": feature_matrix2,
+            "rows": [{"frequency": a, "percentage": b} for (a,b) in zip(total_rows, map(lambda x: x/total, total_rows))],
+            "columns": [{"frequency": a, "percentage": b} for (a,b) in zip(total_cols, map(lambda x: x/total, total_cols))],
+            "total": total,
+            "p_value": p,
+            "chi_squared": chi_squared
+        }
 
-    chi_squared, p, *_ = chi2_contingency(list(map(lambda x : list(map(add_eps, x)), feature_matrix)), correction=False)
+        association_json = json.dumps(association, sort_keys=True)
 
-    feature_matrix2 = [
-        [
-            {
-                "frequency": cell,
-                "row_percentage": div(cell, total_rows[i]),
-                "column_percentage": div(cell, total_cols[j]),
-                "total_percentage": div(cell, total)
-            } for j, cell in enumerate(row)
-        ] for i, row in enumerate(feature_matrix)
-    ]
+        conn.execute(cache.insert().values(association=association_json, table=table_name, year=year, cohort_features=cohort_features_json, feature_a=feature_a_json, feature_b=feature_b_json, access_time=timestamp))
 
-    feature_a = feature_a.copy()
-    feature_b = feature_b.copy()
-    feature_a["biolink_class"] = inflection.underscore(lookUpFeatureClass(table_name, ka))
-    feature_b["biolink_class"] = inflection.underscore(lookUpFeatureClass(table_name, kb))
+    else:
+        association_json = result[0]
+        association = json.loads(association_json)
+        conn.execute(cache.update().where(cache.c.table == table_name).where(cache.c.year == year).where(cache.c.cohort_features == cohort_features_json).where(cache.c.feature_a == feature_a_json).where(cache.c.feature_b == feature_b_json).values(access_time=timestamp))
 
-    return {
-        "feature_a": feature_a,
-        "feature_b": feature_b,
-        "feature_matrix": feature_matrix2,
-        "rows": [{"frequency": a, "percentage": b} for (a,b) in zip(total_rows, map(lambda x: x/total, total_rows))],
-        "columns": [{"frequency": a, "percentage": b} for (a,b) in zip(total_cols, map(lambda x: x/total, total_cols))],
-        "total": total,
-        "p_value": p,
-        "chi_squared": chi_squared
-    }
+    return association
 
 
 def select_feature_count(conn, table_name, year, cohort_features, feature_a):
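With the cache in place, select_feature_matrix now serializes cohort_features, feature_a, and feature_b with json.dumps(..., sort_keys=True), looks that key up in cache, and only falls back to the original contingency-matrix computation on a miss, storing the serialized association afterwards; on a hit it deserializes the stored JSON and merely refreshes access_time. The sort_keys=True normalization is what lets the JSON strings serve as a cache key: dicts that compare equal serialize to the same string regardless of insertion order (list order, for example of feature_qualifiers, is not normalized). A small check, with hypothetical feature values:

import json

# Dicts with the same content but different key order serialize identically
# once sort_keys=True is applied, so logically identical requests map to the
# same cache row.
a = {"feature_name": "Sex", "feature_qualifiers": [{"operator": "=", "value": "male"}]}
b = {"feature_qualifiers": [{"operator": "=", "value": "male"}], "feature_name": "Sex"}

assert a == b
assert json.dumps(a, sort_keys=True) == json.dumps(b, sort_keys=True)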

