Skip to content

Commit

Permalink
Merge pull request #19 from ahlec/strip-furigana-before-reading
Browse files Browse the repository at this point in the history
Remove <ruby> furigana before generating readings
  • Loading branch information
obynio authored Oct 7, 2022
2 parents 6213c22 + e7651c3 commit 117ab41
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 18 deletions.
24 changes: 6 additions & 18 deletions __init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
# You should have received a copy of the GNU General Public License
# along with Japanese Furigana. If not, see <http://www.gnu.org/licenses/>.

import re
import os

from aqt.utils import tooltip
Expand All @@ -29,6 +28,7 @@
from . import reading
from . import config
from .selection import Selection
from .utils import removeFurigana

mecab = reading.MecabController()
config = config.Config()
Expand Down Expand Up @@ -57,31 +57,19 @@ def doIt(editor, action):

def generateFurigana(editor, s):
    """Replace the current selection with a furigana-annotated version.

    Existing furigana is stripped first so that readings are generated from
    the bare text rather than on top of older annotations.

    editor -- unused here; kept so the function matches the other actions.
    s      -- selection object exposing ``selected`` (the selected HTML) and
              ``modify(html)`` to write the result back.
    """
    # removeFurigana already handles the bracket notation (and <ruby> markup),
    # so no separate bracket-stripping regex is needed here.
    html = removeFurigana(s.selected)
    html = mecab.reading(html, config.getIgnoreNumbers(), config.getUseRubyTags())

    if html == s.selected:
        # The generated output is identical to what is already selected.
        tooltip("Nothing to generate!")
    else:
        s.modify(html)

def deleteFurigana(editor, s):
    """Remove all furigana from the current selection.

    Stripping is delegated to removeFurigana, so the result does not depend
    on the configured notation, and a <ruby> element without <rp> children
    cannot crash the action the way a hand-rolled ``re.search(...).group``
    lookup would.

    editor -- unused here; kept so the function matches the other actions.
    s      -- selection object exposing ``selected`` (the selected HTML) and
              ``modify(html)`` to write the result back.
    """
    stripped = removeFurigana(s.selected)

    if stripped == s.selected:
        # Nothing was removed, so the selection contained no furigana.
        tooltip("No furigana found to delete")
    else:
        s.modify(stripped)

setupGuiMenu()
addHook("setupEditorButtons", addButtons)
34 changes: 34 additions & 0 deletions test/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import unittest

import utils

class TestRemoveFurigana(unittest.TestCase):
    """Checks utils.removeFurigana against each furigana notation."""

    # an empty input must come back as an empty output
    def testEmptyString(self):
        result = utils.removeFurigana("")
        self.assertEqual(result, "")

    # the [reading] bracket notation is stripped
    def testRemovesBrackets(self):
        sentence = utils.removeFurigana("日本語[にほんご]を勉強[べんきょう]する")
        self.assertEqual(sentence, "日本語を勉強する")

        compound = utils.removeFurigana("走[はし]り込[こ]む")
        self.assertEqual(compound, "走り込む")

    # <ruby> markup (including the <rp> fallback parentheses) is stripped
    def testRemovesRuby(self):
        sentence = utils.removeFurigana("<ruby>日本語<rp>(</rp><rt>にほんご</rt><rp>)</rp></ruby>を<ruby>勉強<rp>(</rp><rt>べんきょう</rt><rp>)</rp></ruby>する")
        self.assertEqual(sentence, "日本語を勉強する")

        compound = utils.removeFurigana("<ruby>走<rp>(</rp><rt>はし</rt><rp>)</rp></ruby>り<ruby>込<rp>(</rp><rt>こ</rt><rp>)</rp></ruby>む")
        self.assertEqual(compound, "走り込む")

    # <ruby> markup that omits the optional <rp> tags is stripped as well
    def testRemovesRubyWithoutRp(self):
        sentence = utils.removeFurigana("<ruby>日本語<rt>にほんご</rt></ruby>を<ruby>勉強<rt>べんきょう</rt></ruby>する")
        self.assertEqual(sentence, "日本語を勉強する")

        compound = utils.removeFurigana("<ruby>走<rt>はし</rt></ruby>り<ruby>込<rt>こ</rt></ruby>む")
        self.assertEqual(compound, "走り込む")

    # HTML tags unrelated to <ruby> must survive untouched
    def testPreservesOtherHtml(self):
        plain = utils.removeFurigana("<b>日本語</b>")
        self.assertEqual(plain, "<b>日本語</b>")

        nested = utils.removeFurigana("ビルの<ruby>形<rp>(</rp><rt>かたち</rt><rp>)</rp></ruby>はほぼ<b><u><ruby>正方形<rp>(</rp><rt>せいほうけい</rt><rp>)</rp></ruby></u></b>だった。")
        self.assertEqual(nested, "ビルの形はほぼ<b><u>正方形</u></b>だった。")

    # both notations in one string are removed together, which also shows the
    # helper does not depend on the user's current config selection
    def testRemovesBothNotations(self):
        mixed = utils.removeFurigana("<ruby>日本語<rp>(</rp><rt>にほんご</rt><rp>)</rp></ruby>を勉強[べんきょう]する")
        self.assertEqual(mixed, "日本語を勉強する")
26 changes: 26 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-

import re

def removeFurigana(text: str) -> str:
    """Return ``text`` with all furigana annotations removed.

    Both notations are stripped in a single call:

    * HTML ruby markup -- each ``<ruby>...</ruby>`` element is replaced by its
      contents minus every ``<rp>...</rp>`` and ``<rt>...</rt>`` element.
    * Bracket notation -- every ``[...]`` group is deleted.

    All other markup is left untouched.
    """
    def rubyBody(match):
        # Only <rp> and <rt> carry the annotation; what remains is the base text.
        return re.sub(r"<rp>.*?</rp>|<rt>.*?</rt>", "", match.group(1), flags=re.DOTALL)

    # DOTALL lets a <ruby> element that contains a line break still be matched;
    # without it such an element would silently be left in place.
    stripped = re.sub(r"<ruby>(.*?)</ruby>", rubyBody, text, flags=re.DOTALL)

    # Raw string: "\[" and "\]" are invalid escapes in a normal string literal.
    return re.sub(r"\[[^\]]*\]", "", stripped)

0 comments on commit 117ab41

Please sign in to comment.