Skip to content

Commit

Permalink
Fix #34, where defective rows were dropped instead of defective lexemes
Browse files (browse the repository at this point in the history)
  • Loading branch information
Sacha Beniamine committed Aug 31, 2024
1 parent 894136b commit 2ea1782
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/qumin/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "2.0.0"
__version__ = "2.0.1"
7 changes: 6 additions & 1 deletion src/qumin/representations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import logging
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd

Expand Down Expand Up @@ -84,11 +85,15 @@ def get_unknown_segments(forms, unknowns, name):
usecols=["form_id", lexemes, cell_col, form_col])

if not defective:
paradigms.dropna(axis=0, inplace=True)
defective_lexemes = set(paradigms.loc[paradigms[form_col].isna(), lexemes].unique())
paradigms = paradigms[~paradigms.loc[:, lexemes].isin(defective_lexemes)]

if most_freq:
inflected = paradigms.loc[:,lexemes].unique()
lexemes_file_name = Path(dataset.basepath) / dataset.get_resource("lexemes").path
lexemes_df = pd.read_csv(lexemes_file_name, usecols=["lexeme_id", "frequency"])
# Restrict to lexemes we have kept, if we dropped defectives
lexemes_df = lexemes_df[lexemes_df.lexeme_id.isin(inflected)]
selected = set(lexemes_df.sort_values("frequency",
ascending=False
).iloc[:most_freq, :].loc[:, "lexeme_id"].to_list())
Expand Down

0 comments on commit 2ea1782

Please sign in to comment.