Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Persian Characters #12

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 65 additions & 30 deletions alphabet.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ var (
Medium: '\ufea0',
Final: '\ufe9e'}

TCHEH = Harf{ // چ
CHEH = Harf{ // چ
Unicode: '\u0686',
Isolated: '\ufb7a',
Beggining: '\ufb7c',
Expand Down Expand Up @@ -152,21 +152,20 @@ var (
Medium: '\ufeae',
Final: '\ufeae'}

JEH = Harf{
Unicode: '\u0698',
Isolated: '\ufb8a',
Beggining: '\u0698',
Medium: '\ufb8b',
Final: '\ufb8b',
}

ZAIN = Harf{ // ز
Unicode: '\u0632',
Isolated: '\ufeaf',
Beggining: '\u0632',
Medium: '\ufeb0',
Final: '\ufeb0'}

ZHEH = Harf{ // ژ
Unicode: '\u0698',
Isolated: '\ufb8a',
Beggining: '\ufb8a',
Medium: '\ufb8b',
Final: '\ufb8b'}

SEEN = Harf{ // س
Unicode: '\u0633',
Isolated: '\ufeb1',
Expand Down Expand Up @@ -244,16 +243,15 @@ var (
Medium: '\ufedc',
Final: '\ufeda'}

KEHEH = Harf{ // ک
KEH = Harf{ // ک
Unicode: '\u06a9',
Isolated: '\ufb8e',
Beggining: '\ufb90',
Medium: '\ufb91',
Final: '\ufb8f',
}
Final: '\ufb8f'}

GAF = Harf{ // گ
Unicode: '\u06af',
Unicode: '\u06AF',
Isolated: '\ufb92',
Beggining: '\ufb94',
Medium: '\ufb95',
Expand Down Expand Up @@ -294,25 +292,25 @@ var (
Medium: '\ufeee',
Final: '\ufeee'}

YEH = Harf{ // ی
Unicode: '\u06cc',
Isolated: '\ufbfc',
Beggining: '\ufbfe',
Medium: '\ufbff',
Final: '\ufbfd'}

ARABICYEH = Harf{ // ي
YEH = Harf{ // ي
Unicode: '\u064a',
Isolated: '\ufef1',
Beggining: '\ufef3',
Medium: '\ufef4',
Final: '\ufef2'}

YEH_DOT_BELOW = Harf{ // ي
Unicode: '\u06cc',
Isolated: '\ufeef',
Beggining: '\ufef3',
Medium: '\ufef4',
Final: '\ufef0'}

ALEF_MAKSURA = Harf{ // ى
Unicode: '\u0649',
Isolated: '\ufeef',
Beggining: '\u0649',
Medium: '\ufef0',
Beggining: '\ufef3',
Medium: '\ufef4',
Final: '\ufef0'}

TATWEEL = Harf{ // ـ
Expand Down Expand Up @@ -345,20 +343,21 @@ var alphabet = []Harf{
WAW_HAMZA_ABOVE,
ALEF_HAMZA_BELOW,
YEH_HAMZA_ABOVE,
YEH_DOT_BELOW,
BEH,
PEH,
TEH,
TEH_MARBUTA,
THEH,
JEEM,
TCHEH,
CHEH,
HAH,
KHAH,
DAL,
THAL,
REH,
JEH,
ZAIN,
ZHEH,
SEEN,
SHEEN,
SAD,
Expand All @@ -370,25 +369,47 @@ var alphabet = []Harf{
FEH,
QAF,
KAF,
KEHEH,
KEH,
GAF,
LAM,
MEEM,
NOON,
HEH,
WAW,
YEH,
ARABICYEH,
ALEF_MAKSURA,
TATWEEL,
LAM_ALEF,
LAM_ALEF_HAMZA_ABOVE,
}

// use map for faster lookups.
var tashkeel = map[rune]bool{FATHA: true, FATHATAN: true, DAMMA: true,
DAMMATAN: true, KASRA: true, KASRATAN: true,
SHADDA: true, SUKUN: true}
var tashkeel = map[rune]bool{
FATHA: true,
FATHATAN: true,
DAMMA: true,
DAMMATAN: true,
KASRA: true,
KASRATAN: true,
SHADDA: true,
SUKUN: true,
}

var isArabic map[rune]bool

// fillIsArabicMap populates the package-level isArabic set on first use.
//
// For every letter in alphabet it marks the base code point and each of the
// four positional forms (isolated, beginning, medium, final) as Arabic. Once
// the map holds at least one entry, further calls return immediately.
//
// NOTE(review): there is no locking around the lazy initialization; confirm
// that callers never invoke this concurrently.
func fillIsArabicMap() {
	if len(isArabic) > 0 {
		return // already populated by an earlier call
	}

	// Five entries per letter is an upper bound; forms shared between letters
	// collapse into a single key.
	isArabic = make(map[rune]bool, 5*len(alphabet))
	for _, letter := range alphabet {
		forms := [...]rune{
			letter.Unicode,
			letter.Isolated,
			letter.Beggining,
			letter.Medium,
			letter.Final,
		}
		for _, form := range forms {
			isArabic[form] = true
		}
	}
}

// use map for faster lookups.
// var special_char = map[rune]bool{"": true, ' ': true, '?': true,
Expand All @@ -408,5 +429,19 @@ var beggining_after = map[Harf]bool{
THAL: true,
REH: true,
ZAIN: true,
ZHEH: true,
WAW: true,
ALEF_MAKSURA: true}

// numeric maps each ASCII decimal digit '0'..'9' to the code point at the same
// offset in U+06F0..U+06F9 (the Extended Arabic-Indic digits).
var numeric = func() map[rune]rune {
	const (
		asciiZero    = '0'
		extendedZero = '\u06F0'
		digitCount   = 10
	)
	digits := make(map[rune]rune, digitCount)
	for offset := rune(0); offset < digitCount; offset++ {
		digits[asciiZero+offset] = extendedZero + offset
	}
	return digits
}()
62 changes: 57 additions & 5 deletions stringutils.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
// Package goarabic contains utility functions for working with Arabic strings.
package goarabic

import "strings"

// Reverse returns its argument string reversed rune-wise left to right.
func Reverse(s string) string {
r := []rune(s)
Expand Down Expand Up @@ -62,6 +64,10 @@ func getCharGlyph(previousChar, currentChar, nextChar rune) rune {
previousIn := false // in the Arabic Alphabet or not
nextIn := false // in the Arabic Alphabet or not

if number, ok := numeric[currentChar]; ok {
return number
}

for _, s := range alphabet {
if s.equals(previousChar) { // previousChar in the Arabic Alphabet ?
previousIn = true
Expand All @@ -78,8 +84,27 @@ func getCharGlyph(previousChar, currentChar, nextChar rune) rune {
continue
}

if currentChar == LAM.Unicode {
if nextChar == ALEF.Unicode {
if previousIn {
return LAM_ALEF.Medium
}
return LAM_ALEF.Beggining
}
if nextChar == ALEF_HAMZA_ABOVE.Unicode {
if previousIn {
return LAM_ALEF_HAMZA_ABOVE.Medium
}
return LAM_ALEF_HAMZA_ABOVE.Beggining
}
}

if previousChar == LAM.Unicode && (currentChar == ALEF.Unicode || currentChar == ALEF_HAMZA_ABOVE.Unicode) {
return 0
}

if previousIn && nextIn { // between two Arabic Alphabet, return the medium glyph
for s, _ := range beggining_after {
for s := range beggining_after {
if s.equals(previousChar) {
return getHarf(currentChar).Beggining
}
Expand All @@ -93,7 +118,7 @@ func getCharGlyph(previousChar, currentChar, nextChar rune) rune {
}

if previousIn { // final (because the next is not in the Arabic Alphabet)
for s, _ := range beggining_after {
for s := range beggining_after {
if s.equals(previousChar) {
return getHarf(currentChar).Isolated
}
Expand Down Expand Up @@ -139,11 +164,10 @@ func getHarf(char rune) Harf {
return Harf{Unicode: char, Isolated: char, Medium: char, Final: char}
}

//RemoveAllNonAlphabetChars deletes all characters which are not included in Arabic Alphabet
// RemoveAllNonArabicChars deletes all characters which are not included in the Arabic alphabet.
func RemoveAllNonArabicChars(text string) string {
runes := []rune(text)
newText := []rune{}
for _, current := range runes {
for _, current := range text {
inAlphabet := false
for _, s := range alphabet {
if s.equals(current) {
Expand All @@ -157,6 +181,31 @@ func RemoveAllNonArabicChars(text string) string {
return string(newText)
}

// FixArabic converts the Arabic words in text to their presentation forms and
// reverses the order of the words.
//
// The text is split on whitespace with strings.Fields. A word whose first rune
// is present in isArabic is passed through ToGlyph and then Reverse; any other
// word is kept unchanged. The words are then emitted last-to-first, separated
// by single spaces, so leading, trailing and repeated whitespace from the
// input is not preserved.
//
// An empty or whitespace-only text yields the empty string.
func FixArabic(text string) string {
	words := strings.Fields(text)
	if len(words) == 0 {
		// Covers both "" and whitespace-only input, for which there is
		// nothing to convert or join.
		return ""
	}
	fillIsArabicMap()

	// Fill the result back to front so the word order ends up reversed.
	fixed := make([]string, len(words))
	for i, word := range words {
		// strings.Fields never yields empty words, so index 0 is always valid.
		if first := []rune(word)[0]; isArabic[first] {
			word = Reverse(ToGlyph(word))
		}
		fixed[len(words)-1-i] = word
	}
	return strings.Join(fixed, " ")
}

// ToGlyph returns the glyph representation of the given text
func ToGlyph(text string) string {
var prev, next rune
Expand All @@ -182,6 +231,9 @@ func ToGlyph(text string) string {

// get the current char representation or return the same if unnecessary
glyph := getCharGlyph(prev, current, next)
if glyph == 0 {
continue
}

// append the new char representation to the newText
newText = append(newText, glyph)
Expand Down
21 changes: 21 additions & 0 deletions stringutils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,12 @@ func TestToGlyph(t *testing.T) {
want string
}{
{"تجربة النص العربي", "\ufe97\ufea0\ufeae\ufe91\ufe94 \u0627\ufedf\ufee8\ufeba \u0627\ufedf\ufecc\ufeae\ufe91\ufef2"},
{"تجربة لا النص العربي", "\ufe97\ufea0\ufeae\ufe91\ufe94 \ufefb \u0627\ufedf\ufee8\ufeba \u0627\ufedf\ufecc\ufeae\ufe91\ufef2"},
{"۰۱۲۳۴۵۶۷۸۹", "۰۱۲۳۴۵۶۷۸۹"},
{"0123456789", "۰۱۲۳۴۵۶۷۸۹"},
{"0123456789", "۰۱۲۳۴۵۶۷۸۹"},
{"", ""},
{"Sample english", "Sample english"},
}
for _, c := range cases {
got := ToGlyph(c.in)
Expand All @@ -76,6 +81,22 @@ func TestToGlyph(t *testing.T) {
}
}

// TestFixArabic runs FixArabic over inputs that mix Arabic and Latin words and
// compares each result with the expected string.
func TestFixArabic(t *testing.T) {
	tests := []struct {
		in   string
		want string
	}{
		{in: "تجربة text العربي", want: "ﻲﺑﺮﻌﻟا text ﺔﺑﺮﺠﺗ"},
		{in: "Sample جمله english", want: "english ﻪﻠﻤﺟ Sample"},
	}
	for i := range tests {
		tc := tests[i]
		if got := FixArabic(tc.in); got != tc.want {
			t.Errorf("FixArabic(...) got %q, want %+q", got, tc.want)
		}
	}
}

func TestRemoveTatweel(t *testing.T) {
cases := []struct {
in, want string
Expand Down