diff --git a/alphabet.go b/alphabet.go index 5fe56f8..2b2ff42 100644 --- a/alphabet.go +++ b/alphabet.go @@ -110,7 +110,7 @@ var ( Medium: '\ufea0', Final: '\ufe9e'} - TCHEH = Harf{ // چ + CHEH = Harf{ // چ Unicode: '\u0686', Isolated: '\ufb7a', Beggining: '\ufb7c', @@ -152,14 +152,6 @@ var ( Medium: '\ufeae', Final: '\ufeae'} - JEH = Harf{ - Unicode: '\u0698', - Isolated: '\ufb8a', - Beggining: '\u0698', - Medium: '\ufb8b', - Final: '\ufb8b', - } - ZAIN = Harf{ // ز Unicode: '\u0632', Isolated: '\ufeaf', @@ -167,6 +159,13 @@ var ( Medium: '\ufeb0', Final: '\ufeb0'} + ZHEH = Harf{ // ژ + Unicode: '\u0698', + Isolated: '\ufb8a', + Beggining: '\ufb8a', + Medium: '\ufb8b', + Final: '\ufb8b'} + SEEN = Harf{ // س Unicode: '\u0633', Isolated: '\ufeb1', @@ -244,16 +243,15 @@ var ( Medium: '\ufedc', Final: '\ufeda'} - KEHEH = Harf{ // ک + KEH = Harf{ // ک Unicode: '\u06a9', Isolated: '\ufb8e', Beggining: '\ufb90', Medium: '\ufb91', - Final: '\ufb8f', - } + Final: '\ufb8f'} GAF = Harf{ // گ - Unicode: '\u06af', + Unicode: '\u06AF', Isolated: '\ufb92', Beggining: '\ufb94', Medium: '\ufb95', @@ -294,25 +292,25 @@ var ( Medium: '\ufeee', Final: '\ufeee'} - YEH = Harf{ // ی - Unicode: '\u06cc', - Isolated: '\ufbfc', - Beggining: '\ufbfe', - Medium: '\ufbff', - Final: '\ufbfd'} - - ARABICYEH = Harf{ // ي + YEH = Harf{ // ي Unicode: '\u064a', Isolated: '\ufef1', Beggining: '\ufef3', Medium: '\ufef4', Final: '\ufef2'} + YEH_DOT_BELOW = Harf{ // ي + Unicode: '\u06cc', + Isolated: '\ufeef', + Beggining: '\ufef3', + Medium: '\ufef4', + Final: '\ufef0'} + ALEF_MAKSURA = Harf{ // ى Unicode: '\u0649', Isolated: '\ufeef', - Beggining: '\u0649', - Medium: '\ufef0', + Beggining: '\ufef3', + Medium: '\ufef4', Final: '\ufef0'} TATWEEL = Harf{ // ـ @@ -345,20 +343,21 @@ var alphabet = []Harf{ WAW_HAMZA_ABOVE, ALEF_HAMZA_BELOW, YEH_HAMZA_ABOVE, + YEH_DOT_BELOW, BEH, PEH, TEH, TEH_MARBUTA, THEH, JEEM, - TCHEH, + CHEH, HAH, KHAH, DAL, THAL, REH, - JEH, ZAIN, + ZHEH, SEEN, SHEEN, SAD, @@ -370,7 +369,7 @@ var alphabet = []Harf{ FEH, QAF, KAF, - KEHEH, + KEH, GAF, LAM, MEEM, @@ -378,7 +377,6 @@ var alphabet = []Harf{ HEH, WAW, YEH, - ARABICYEH, ALEF_MAKSURA, TATWEEL, LAM_ALEF, @@ -386,9 +384,32 @@ var alphabet = []Harf{ } // use map for faster lookups. -var tashkeel = map[rune]bool{FATHA: true, FATHATAN: true, DAMMA: true, - DAMMATAN: true, KASRA: true, KASRATAN: true, - SHADDA: true, SUKUN: true} +var tashkeel = map[rune]bool{ + FATHA: true, + FATHATAN: true, + DAMMA: true, + DAMMATAN: true, + KASRA: true, + KASRATAN: true, + SHADDA: true, + SUKUN: true, +} + +var isArabic map[rune]bool + +func fillIsArabicMap() { + if len(isArabic) != 0 { + return + } + isArabic = make(map[rune]bool, len(alphabet)*5) + for _, harf := range alphabet { + isArabic[harf.Beggining] = true + isArabic[harf.Final] = true + isArabic[harf.Isolated] = true + isArabic[harf.Medium] = true + isArabic[harf.Unicode] = true + } +} // use map for faster lookups. // var special_char = map[rune]bool{"": true, ' ': true, '?': true, @@ -408,5 +429,19 @@ var beggining_after = map[Harf]bool{ THAL: true, REH: true, ZAIN: true, + ZHEH: true, WAW: true, ALEF_MAKSURA: true} + +var numeric = map[rune]rune{ + '0': '\u06F0', + '1': '\u06F1', + '2': '\u06F2', + '3': '\u06F3', + '4': '\u06F4', + '5': '\u06F5', + '6': '\u06F6', + '7': '\u06F7', + '8': '\u06F8', + '9': '\u06F9', +} diff --git a/stringutils.go b/stringutils.go index ba81fb8..8c0461d 100644 --- a/stringutils.go +++ b/stringutils.go @@ -1,6 +1,8 @@ // Package goarabic contains utility functions for working with Arabic strings. package goarabic +import "strings" + // Reverse returns its argument string reversed rune-wise left to right. func Reverse(s string) string { r := []rune(s) @@ -62,6 +64,10 @@ func getCharGlyph(previousChar, currentChar, nextChar rune) rune { previousIn := false // in the Arabic Alphabet or not nextIn := false // in the Arabic Alphabet or not + if number, ok := numeric[currentChar]; ok { + return number + } + for _, s := range alphabet { if s.equals(previousChar) { // previousChar in the Arabic Alphabet ? previousIn = true @@ -78,8 +84,27 @@ func getCharGlyph(previousChar, currentChar, nextChar rune) rune { continue } + if currentChar == LAM.Unicode { + if nextChar == ALEF.Unicode { + if previousIn { + return LAM_ALEF.Medium + } + return LAM_ALEF.Beggining + } + if nextChar == ALEF_HAMZA_ABOVE.Unicode { + if previousIn { + return LAM_ALEF_HAMZA_ABOVE.Medium + } + return LAM_ALEF_HAMZA_ABOVE.Beggining + } + } + + if previousChar == LAM.Unicode && (currentChar == ALEF.Unicode || currentChar == ALEF_HAMZA_ABOVE.Unicode) { + return 0 + } + if previousIn && nextIn { // between two Arabic Alphabet, return the medium glyph - for s, _ := range beggining_after { + for s := range beggining_after { if s.equals(previousChar) { return getHarf(currentChar).Beggining } @@ -93,7 +118,7 @@ func getCharGlyph(previousChar, currentChar, nextChar rune) rune { } if previousIn { // final (because the next is not in the Arabic Alphabet) - for s, _ := range beggining_after { + for s := range beggining_after { if s.equals(previousChar) { return getHarf(currentChar).Isolated } @@ -139,11 +164,10 @@ func getHarf(char rune) Harf { return Harf{Unicode: char, Isolated: char, Medium: char, Final: char} } -//RemoveAllNonAlphabetChars deletes all characters which are not included in Arabic Alphabet +// RemoveAllNonAlphabetChars deletes all characters which are not included in Arabic Alphabet func RemoveAllNonArabicChars(text string) string { - runes := []rune(text) newText := []rune{} - for _, current := range runes { + for _, current := range text { inAlphabet := false for _, s := range alphabet { if s.equals(current) { @@ -157,6 +181,31 @@ func RemoveAllNonArabicChars(text string) string { return string(newText) } +// FixArabic searches for arabic words in text and fix their presentation form +func FixArabic(text string) string { + if len(text) == 0 { + return text + } + var sb strings.Builder + fillIsArabicMap() + words := strings.Fields(text) + fixedWords := make([]string, 0) + for _, word := range words { + runes := []rune(word) + if isArabic[runes[0]] { + fixedWords = append(fixedWords, Reverse(ToGlyph(word))) + } else { + fixedWords = append(fixedWords, word) + } + } + for i := len(words) - 1; i >= 0; i-- { + sb.WriteString(" ") + sb.WriteString(fixedWords[i]) + } + //len := sb.Len() + return sb.String()[1:] //remove trailing space +} + // ToGlyph returns the glyph representation of the given text func ToGlyph(text string) string { var prev, next rune @@ -182,6 +231,9 @@ func ToGlyph(text string) string { // get the current char representation or return the same if unnecessary glyph := getCharGlyph(prev, current, next) + if glyph == 0 { + continue + } // append the new char representation to the newText newText = append(newText, glyph) diff --git a/stringutils_test.go b/stringutils_test.go index ac7ca8e..aa8624a 100644 --- a/stringutils_test.go +++ b/stringutils_test.go @@ -66,7 +66,12 @@ func TestToGlyph(t *testing.T) { want string }{ {"تجربة النص العربي", "\ufe97\ufea0\ufeae\ufe91\ufe94 \u0627\ufedf\ufee8\ufeba \u0627\ufedf\ufecc\ufeae\ufe91\ufef2"}, + {"تجربة لا النص العربي", "\ufe97\ufea0\ufeae\ufe91\ufe94 \ufefb \u0627\ufedf\ufee8\ufeba \u0627\ufedf\ufecc\ufeae\ufe91\ufef2"}, + {"۰۱۲۳۴۵۶۷۸۹", "۰۱۲۳۴۵۶۷۸۹"}, + {"0123456789", "۰۱۲۳۴۵۶۷۸۹"}, + {"0123456789", "۰۱۲۳۴۵۶۷۸۹"}, {"", ""}, + {"Sample english", "Sample english"}, } for _, c := range cases { got := ToGlyph(c.in) @@ -76,6 +81,22 @@ func TestToGlyph(t *testing.T) { } } +func TestFixArabic(t *testing.T) { + cases := []struct { + in string + want string + }{ + {"تجربة text العربي", "ﻲﺑﺮﻌﻟا text ﺔﺑﺮﺠﺗ"}, + {"Sample جمله english", "english ﻪﻠﻤﺟ Sample"}, + } + for _, c := range cases { + got := FixArabic(c.in) + if got != c.want { + t.Errorf("FixArabic(...) got %q, want %+q", got, c.want) + } + } +} + func TestRemoveTatweel(t *testing.T) { cases := []struct { in, want string