Skip to content

Commit

Permalink
Merge pull request #381 from sfu-db/fix/create_report_hash
Browse files: browse the repository at this point in the history
fix(eda.create_report): handle unhashable dtypes
  • Loading branch information
jinglinpeng authored Oct 3, 2020
2 parents 2153b74 + 7743749 commit fd9dcf0
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 44 deletions.
76 changes: 42 additions & 34 deletions dataprep/eda/create_report/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,42 +137,45 @@ def format_basic(df: dd.DataFrame) -> Dict[str, Any]:
"col_type": itmdt.visual_type.replace("_column", ""),
}

# interactions
res["has_interaction"] = True
itmdt = Intermediate(data=data["scat"], visual_type="correlation_crossfilter")
rndrd = render_correlation(itmdt)
rndrd.sizing_mode = "stretch_width"
res["interactions"] = components(rndrd)

# correlations
res["has_correlation"] = True
dfs: Dict[str, pd.DataFrame] = {}
for method, corr in data["corrs"].items():
ndf = pd.DataFrame(
{
"x": data["num_cols"][data["cordx"]],
"y": data["num_cols"][data["cordy"]],
"correlation": corr.ravel(),
}
if len(data["num_cols"]) > 0:
# interactions
res["has_interaction"] = True
itmdt = Intermediate(data=data["scat"], visual_type="correlation_crossfilter")
rndrd = render_correlation(itmdt)
rndrd.sizing_mode = "stretch_width"
res["interactions"] = components(rndrd)

# correlations
res["has_correlation"] = True
dfs: Dict[str, pd.DataFrame] = {}
for method, corr in data["corrs"].items():
ndf = pd.DataFrame(
{
"x": data["num_cols"][data["cordx"]],
"y": data["num_cols"][data["cordy"]],
"correlation": corr.ravel(),
}
)
dfs[method.name] = ndf[data["cordy"] > data["cordx"]]
itmdt = Intermediate(
data=dfs,
axis_range=list(data["num_cols"]),
visual_type="correlation_heatmaps",
)
dfs[method.name] = ndf[data["cordy"] > data["cordx"]]
itmdt = Intermediate(
data=dfs, axis_range=list(data["num_cols"]), visual_type="correlation_heatmaps",
)
rndrd = render_correlation(itmdt)
figs.clear()
for tab in rndrd.tabs:
fig = tab.child
fig.sizing_mode = "stretch_width"
fig.title = Title(text=tab.title, align="center", text_font_size="20px")
figs.append(fig)
res["correlations"] = components(figs)
rndrd = render_correlation(itmdt)
figs.clear()
for tab in rndrd.tabs:
fig = tab.child
fig.sizing_mode = "stretch_width"
fig.title = Title(text=tab.title, align="center", text_font_size="20px")
figs.append(fig)
res["correlations"] = components(figs)
else:
res["has_interaction"], res["has_correlation"] = False, False

# missing
res["has_missing"] = True

itmdt = completions["miss"](data["miss"])

rndrd = render_missing(itmdt)
figs.clear()
for tab in rndrd.tabs:
Expand Down Expand Up @@ -200,16 +203,21 @@ def basic_computations(df: dd.DataFrame) -> Tuple[Dict[str, Any], Dict[str, Any]
data["num_cols"] = df_num.columns
first_rows = df.select_dtypes(CATEGORICAL_DTYPES).head

# overview
data["ov"] = calc_stats(df.frame, None)
# # variables
# variables
for col in df.columns:
if is_dtype(detect_dtype(df.frame[col]), Continuous()):
data[col] = cont_comps(df.frame[col], 20)
elif is_dtype(detect_dtype(df.frame[col]), Nominal()):
# cast the column as string type if it contains a mutable type
try:
first_rows[col].apply(hash)
except TypeError:
df.frame[col] = df.frame[col].astype(str)
data[col] = nom_comps(
df.frame[col], first_rows[col], 10, True, 10, 20, True, False, False
)
# overview
data["ov"] = calc_stats(df.frame, None)
# interactions
data["scat"] = df_num.frame.map_partitions(
lambda x: x.sample(min(1000, x.shape[0])), meta=df_num.frame
Expand Down
6 changes: 2 additions & 4 deletions dataprep/eda/distribution/compute/overview.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,10 +241,8 @@ def calc_stats(df: dd.DataFrame, dtype: Optional[DTypeDef]) -> Dict[str, Any]:
----------
df
a DataFrame
dtype_cnts
a dictionary that contains the count for each type
num_cols:
numerical columns in the dataset
dtype
str or DType or dict of str or dict of DType
"""

stats = {"nrows": df.shape[0]}
Expand Down
10 changes: 5 additions & 5 deletions dataprep/eda/distribution/compute/univariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@ def compute_univariate(
col_dtype = detect_dtype(df[x], dtype)
if is_dtype(col_dtype, Nominal()):
first_rows = df[x].head() # dd.Series.head() triggers a (small) data read
# cast the column as string type if it contains a mutable type
try:
first_rows.apply(hash)
except TypeError:
df[x] = df[x].astype(str)
# all computations for plot(df, Nominal())
data = nom_comps(
df[x],
Expand Down Expand Up @@ -170,11 +175,6 @@ def nom_comps(

# total rows
data["nrows"] = srs.shape[0]
# cast the column as string type if it contains a mutable type
try:
first_rows.apply(hash)
except TypeError:
srs = srs.astype(str)
# drop null values
srs = srs.dropna()

Expand Down
3 changes: 2 additions & 1 deletion dataprep/tests/eda/test_create_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def simpledf() -> pd.DataFrame:
df = pd.concat(
[df, pd.Series(np.random.choice(["a", "b", "c"], 1000, replace=True))], axis=1
)
df = pd.concat([df, pd.Series([["foo"] * 1000])], axis=1)
df = pd.concat(
[
df,
Expand All @@ -29,7 +30,7 @@ def simpledf() -> pd.DataFrame:
axis=1,
)
# df = pd.concat([df, pd.Series(np.zeros(1000))], axis=1)
df.columns = ["a", "b", "c", "d", "e"]
df.columns = ["a", "b", "c", "d", "e", "f"]
# df["e"] = pd.to_datetime(df["e"])

idx = np.arange(1000)
Expand Down

0 comments on commit fd9dcf0

Please sign in to comment.