変換ではなく検証を行うように変更 (change to validate instead of converting)
takana-v committed Feb 4, 2025
1 parent fcc835d commit a9a4b19
Showing 2 changed files with 44 additions and 34 deletions.
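The commit swaps sanitizing AfterValidators for validators that raise ValueError, which pydantic reports as a ValidationError when the model is constructed. A minimal, self-contained sketch of that pattern, for orientation only (the Demo model and reject_newlines name are illustrative, not from the repository):

from typing import Annotated

from pydantic import AfterValidator, BaseModel, ValidationError


def reject_newlines(text: str) -> str:
    # Mirrors the new check_* validators below: raise instead of stripping.
    if "\n" in text or "\r" in text:
        raise ValueError("改行が含まれています。")
    return text


class Demo(BaseModel):  # illustrative stand-in for UserDictWord
    surface: Annotated[str, AfterValidator(reject_newlines)]


try:
    Demo(surface="te\nst")
except ValidationError as err:
    print(err)  # pydantic wraps the ValueError raised by the validator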
34 changes: 18 additions & 16 deletions test/unit/user_dict/test_user_dict_model.py
@@ -70,17 +70,18 @@ def test_convert_to_zenkaku() -> None:
 
 
 def test_remove_newlines_and_null() -> None:
-    """UserDictWord は surface 内の改行や null 文字を削除する。"""
+    """UserDictWord は surface 内の改行や null 文字をエラーとする。"""
     # Inputs
-    test_value = generate_model()
-    test_value["surface"] = "te\n\r\x00st"
-    # Expects
-    true_surface = "test"
-    # Outputs
-    surface = UserDictWord(**test_value).surface
+    test_value_newlines = generate_model()
+    test_value_newlines["surface"] = "te\r\nst"
+    test_value_null = generate_model()
+    test_value_null["surface"] = "te\x00st"
 
     # Test
-    assert surface == true_surface
+    with pytest.raises(ValidationError):
+        UserDictWord(**test_value_newlines)
+    with pytest.raises(ValidationError):
+        UserDictWord(**test_value_null)
 
 
 def test_count_mora() -> None:
@@ -141,17 +142,18 @@ def test_invalid_pronunciation_not_katakana() -> None:
 
 
 def test_invalid_pronunciation_newlines_and_null() -> None:
-    """UserDictWord は pronunciation 内の改行や null 文字を削除する。"""
+    """UserDictWord は pronunciation 内の改行や null 文字をエラーとする。"""
     # Inputs
-    test_value = generate_model()
-    test_value["pronunciation"] = "ボイ\n\r\x00ボ"
-    # Expects
-    true_pronunciation = "ボイボ"
-    # Outputs
-    pronunciation = UserDictWord(**test_value).pronunciation
+    test_value_newlines = generate_model()
+    test_value_newlines["pronunciation"] = "ボイ\r\nボ"
+    test_value_null = generate_model()
+    test_value_null["pronunciation"] = "ボイ\x00ボ"
 
     # Test
-    assert pronunciation == true_pronunciation
+    with pytest.raises(ValidationError):
+        UserDictWord(**test_value_newlines)
+    with pytest.raises(ValidationError):
+        UserDictWord(**test_value_null)
 
 
 def test_invalid_pronunciation_invalid_sutegana() -> None:
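If the project's unit tests run under pytest (the cases above rely on pytest.raises), the updated cases could be exercised selectively with something like the following; the -k expression is only an assumed convenience, not taken from the repository:

python -m pytest test/unit/user_dict/test_user_dict_model.py -k "newlines_and_null"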
44 changes: 26 additions & 18 deletions voicevox_engine/user_dict/model.py
@@ -27,12 +27,20 @@ class WordTypes(str, Enum):
 USER_DICT_MAX_PRIORITY = 10
 
 
-def remove_newlines_and_null(text: str) -> str:
-    return text.replace("\n", "").replace("\r", "").replace("\x00", "")
+def check_newlines_and_null(text: str) -> str:
+    if "\n" in text or "\r" in text:
+        raise ValueError("改行が含まれています。")
+    if "\x00" in text:
+        raise ValueError("Null文字が含まれています。")
+    return text
 
 
-def remove_comma_and_double_quote(text: str) -> str:
-    return text.replace(",", "").replace('"', "")
+def check_comma_and_double_quote(text: str) -> str:
+    if "," in text:
+        raise ValueError("カンマが含まれています。")
+    if '"' in text:
+        raise ValueError("ダブルクォートが含まれています。")
+    return text
 
 
 def convert_to_zenkaku(surface: str) -> str:
Expand Down Expand Up @@ -66,10 +74,10 @@ def check_is_katakana(pronunciation: str) -> str:
return pronunciation


SanitizedStr = Annotated[
CsvSafeStr = Annotated[
str,
AfterValidator(remove_newlines_and_null),
AfterValidator(remove_comma_and_double_quote),
AfterValidator(check_newlines_and_null),
AfterValidator(check_comma_and_double_quote),
]


@@ -83,26 +91,26 @@ class UserDictWord(BaseModel):
     surface: Annotated[
         str,
         AfterValidator(convert_to_zenkaku),
-        AfterValidator(remove_newlines_and_null),
+        AfterValidator(check_newlines_and_null),
     ] = Field(description="表層形")
     priority: int = Field(
         description="優先度", ge=USER_DICT_MIN_PRIORITY, le=USER_DICT_MAX_PRIORITY
     )
     context_id: int = Field(description="文脈ID", default=1348)
-    part_of_speech: SanitizedStr = Field(description="品詞")
-    part_of_speech_detail_1: SanitizedStr = Field(description="品詞細分類1")
-    part_of_speech_detail_2: SanitizedStr = Field(description="品詞細分類2")
-    part_of_speech_detail_3: SanitizedStr = Field(description="品詞細分類3")
-    inflectional_type: SanitizedStr = Field(description="活用型")
-    inflectional_form: SanitizedStr = Field(description="活用形")
-    stem: SanitizedStr = Field(description="原形")
-    yomi: SanitizedStr = Field(description="読み")
-    pronunciation: Annotated[SanitizedStr, AfterValidator(check_is_katakana)] = Field(
+    part_of_speech: CsvSafeStr = Field(description="品詞")
+    part_of_speech_detail_1: CsvSafeStr = Field(description="品詞細分類1")
+    part_of_speech_detail_2: CsvSafeStr = Field(description="品詞細分類2")
+    part_of_speech_detail_3: CsvSafeStr = Field(description="品詞細分類3")
+    inflectional_type: CsvSafeStr = Field(description="活用型")
+    inflectional_form: CsvSafeStr = Field(description="活用形")
+    stem: CsvSafeStr = Field(description="原形")
+    yomi: CsvSafeStr = Field(description="読み")
+    pronunciation: Annotated[CsvSafeStr, AfterValidator(check_is_katakana)] = Field(
         description="発音"
     )
     accent_type: int = Field(description="アクセント型")
     mora_count: int | SkipJsonSchema[None] = Field(default=None, description="モーラ数")
-    accent_associative_rule: SanitizedStr = Field(description="アクセント結合規則")
+    accent_associative_rule: CsvSafeStr = Field(description="アクセント結合規則")
 
     @model_validator(mode="after")
     def check_mora_count_and_accent_type(self) -> Self:
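For reference, a minimal sketch of how the new validators behave when imported directly from voicevox_engine/user_dict/model.py; the error messages are the ones added above, and calling the functions standalone like this is illustrative only:

from voicevox_engine.user_dict.model import (
    check_comma_and_double_quote,
    check_newlines_and_null,
)

check_newlines_and_null("test")            # passes through unchanged
try:
    check_newlines_and_null("te\nst")      # previously the newline was silently stripped
except ValueError as err:
    print(err)                             # 改行が含まれています。
try:
    check_comma_and_double_quote('te"st')
except ValueError as err:
    print(err)                             # ダブルクォートが含まれています。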
