Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace whatlanggo with an ad-hoc implementation #3041

Merged
merged 1 commit into from
Dec 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ module miniflux.app/v2

require (
github.com/PuerkitoBio/goquery v1.10.0
github.com/abadojack/whatlanggo v1.0.1
github.com/andybalholm/brotli v1.1.1
github.com/coreos/go-oidc/v3 v3.11.0
github.com/go-webauthn/webauthn v0.11.2
Expand Down
2 changes: 0 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4=
github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4=
github.com/abadojack/whatlanggo v1.0.1 h1:19N6YogDnf71CTHm3Mp2qhYfkRdyvbgwWdd2EPxJRG4=
github.com/abadojack/whatlanggo v1.0.1/go.mod h1:66WiQbSbJBIlOZMsvbKe5m6pzQovxCH9B/K8tQB2uoc=
github.com/andybalholm/brotli v1.1.1 h1:PR2pgnyFznKEugtsUo0xLdDop5SKXd5Qf5ysW+7XdTA=
github.com/andybalholm/brotli v1.1.1/go.mod h1:05ib4cKhjx3OQYUY22hTVd34Bc8upXjOLL2rKwwZBoA=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
Expand Down
38 changes: 21 additions & 17 deletions internal/reader/readingtime/readingtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,33 +7,37 @@ package readingtime
import (
"math"
"strings"
"unicode"
"unicode/utf8"

"miniflux.app/v2/internal/reader/sanitizer"

"github.com/abadojack/whatlanggo"
)

// EstimateReadingTime returns the estimated reading time of an article in minutes.
func EstimateReadingTime(content string, defaultReadingSpeed, cjkReadingSpeed int) int {
sanitizedContent := sanitizer.StripTags(content)
truncationPoint := min(len(sanitizedContent), 50)

// Literature on language detection says that around 100 characters is enough, so we're safe here.
truncationPoint := min(len(sanitizedContent), 250)

// We're only interested in identifying Japanese/Chinese/Korean.
options := whatlanggo.Options{
Whitelist: map[whatlanggo.Lang]bool{
whatlanggo.Jpn: true,
whatlanggo.Cmn: true,
whatlanggo.Kor: true,
},
if isCJK(sanitizedContent[:truncationPoint]) {
return int(math.Ceil(float64(utf8.RuneCountInString(sanitizedContent)) / float64(cjkReadingSpeed)))
}
langInfo := whatlanggo.DetectWithOptions(sanitizedContent[:truncationPoint], options)
return int(math.Ceil(float64(len(strings.Fields(sanitizedContent))) / float64(defaultReadingSpeed)))
}

if langInfo.IsReliable() {
return int(math.Ceil(float64(utf8.RuneCountInString(sanitizedContent)) / float64(cjkReadingSpeed)))
// isCJK reports whether text appears to be written in a CJK script
// (Han, Hangul, Hiragana, Katakana, Yi or Bopomofo).
//
// Only the first 50 bytes of text are sampled. Whitespace is ignored because
// it carries no script information, and replacement characters are ignored
// because that is what a multi-byte rune decodes to when a byte-based cut
// (here or in a caller) lands in the middle of it. The text is considered CJK
// when at least half of the remaining runes belong to one of the scripts
// above. Empty or whitespace-only input returns false.
func isCJK(text string) bool {
	const sampleBytes = 50

	totalCJK, totalRunes := 0, 0
	for _, r := range text[:min(len(text), sampleBytes)] {
		if r == unicode.ReplacementChar || unicode.IsSpace(r) {
			continue
		}
		totalRunes++
		if unicode.In(r,
			unicode.Han,
			unicode.Hangul,
			unicode.Hiragana,
			unicode.Katakana,
			unicode.Yi,
			unicode.Bopomofo,
		) {
			totalCJK++
		}
	}

	// Compare rune counts with rune counts: CJK characters take three bytes
	// each in UTF-8, so a byte length is not a meaningful denominator.
	return totalRunes > 0 && 2*totalCJK >= totalRunes
}
Loading