3
3
# Functions to convert to different UTF encodings
4
4
5
5
# Quickly copy and set trailing \0
6
# Diff hunk for fast_utf_copy: the two element-type-specific methods marked `-`
# (one hard-wired to UTF16String, one to UTF32String) are replaced by the single
# parametric method marked `+`, which takes the result string type S explicitly.
- @inline function fast_utf_copy (T:: Type{UInt16} , len, dat)
7
- @inbounds return UTF16String (setindex! (copy! (Vector {T} (len), dat), 0 , len))
8
- end
9
- @inline function fast_utf_copy (T:: Type{Char} , len, dat)
10
- @inbounds return UTF32String (setindex! (copy! (Vector {T} (len), dat), 0 , len))
6
# New method. Arguments:
#   S    - string type used to wrap the result (UTF16String or UTF32String)
#   T    - element type of the buffer that is allocated (UInt16 or Char)
#   len  - meaning depends on `flag` (see below)
#   dat  - source data to copy
#   flag - false (default): `len` is the size of the new buffer; `dat` is copied
#          into it and index `len` is set to 0. Visible callers pass
#          length(dat)+1 here, so the 0 lands in the extra trailing slot.
#          true: `len` is the number of elements to copy; the buffer is allocated
#          with len+1 elements, `len` elements are copied starting at index 1,
#          and index len+1 is set to 0.
# NOTE(review): the body runs under @inbounds, so `len` must be consistent with
# the length of `dat`; nothing in this function checks that.
+ @inline function fast_utf_copy {S <: Union(UTF16String, UTF32String), T <: Union(UInt16, Char)} (:: Type{S} , :: Type{T} , len, dat, flag:: Bool = false )
7
+ @inbounds return flag ? S (setindex! (copy! (Vector {T} (len+ 1 ),1 ,dat,1 ,len),0 ,len+ 1 )) : S (setindex! (copy! (Vector {T} (len), dat), 0 , len))
11
8
end
12
9
13
10
# Get rest of character ch from 3-byte UTF-8 sequence in dat
14
# Diff hunk: get_utf8_3 is renamed to get_utf8_3byte; the body line is unchanged.
#
# Assembles a value from three pieces:
#   bits 12 and up : low 4 bits of `ch`, shifted left by 12
#   bits 6-11      : low 6 bits of dat[pos-1], widened to UInt32 before shifting
#   bits 0-5       : low 6 bits of dat[pos]
# The visible callers advance `pos` by 2 before calling, so `pos` is the index of
# the last byte of the sequence and `ch` is the byte that started it.
# NOTE(review): `ch` itself is not widened here before `<< 12`; confirm callers
# pass a type wide enough to hold the shifted value.
# Bounds checks are disabled (@inbounds): pos-1 and pos must be valid indices.
- @inline function get_utf8_3 (dat, pos, ch)
11
+ @inline function get_utf8_3byte (dat, pos, ch)
15
12
@inbounds return ((ch & 0xf ) << 12 ) | (UInt32 (dat[pos- 1 ] & 0x3f ) << 6 ) | (dat[pos] & 0x3f )
16
13
end
17
14
18
15
# Get rest of character ch from 4-byte UTF-8 sequence in dat
19
# Diff hunk: get_utf8_4 is renamed to get_utf8_4byte; the body lines are unchanged.
#
# Assembles a value from four pieces:
#   bits 18 and up : low 3 bits of `ch`, shifted left by 18
#   bits 12-17     : low 6 bits of dat[pos-2], widened to UInt32 before shifting
#   bits 6-11      : low 6 bits of dat[pos-1], widened to UInt32 before shifting
#   bits 0-5       : low 6 bits of dat[pos]
# The visible callers advance `pos` by 3 before calling, so `pos` is the index of
# the last byte of the sequence and `ch` is the byte that started it.
# NOTE(review): `ch` itself is not widened here before `<< 18`; confirm callers
# pass a type wide enough to hold the shifted value.
# Bounds checks are disabled (@inbounds): pos-2 .. pos must be valid indices.
- @inline function get_utf8_4 (dat, pos, ch)
16
+ @inline function get_utf8_4byte (dat, pos, ch)
20
17
@inbounds return (((ch & 0x7 ) << 18 )
21
18
| (UInt32 (dat[pos- 2 ] & 0x3f ) << 12 )
22
19
| (UInt32 (dat[pos- 1 ] & 0x3f ) << 6 )
23
20
| (dat[pos] & 0x3f ))
24
21
end
25
22
26
23
# Output a character as a 4-byte UTF-8 sequence
27
- @inline function output_utf8_4 (buf, out, ch)
24
+ @inline function output_utf8_4byte! (buf, out, ch)
28
25
@inbounds begin
29
26
buf[out + 1 ] = 0xf0 | (ch >>> 18 )
30
27
buf[out + 2 ] = 0x80 | ((ch >>> 12 ) & 0x3f )
@@ -117,11 +114,11 @@ function convert(::Type{UTF16String}, str::UTF8String)
117
114
# Handle range 0x800-0xffff
118
115
elseif ch < 0xf0
119
116
pos += 2
120
- buf[out += 1 ] = get_utf8_3 (dat, pos, ch)
117
+ buf[out += 1 ] = get_utf8_3byte (dat, pos, ch)
121
118
# Handle range 0x10000-0x10ffff
122
119
else
123
120
pos += 3
124
- ch = get_utf8_4 (dat, pos, ch)
121
+ ch = get_utf8_4byte (dat, pos, ch)
125
122
# output surrogate pair
126
123
buf[out += 1 ] = UInt16 (0xd7c0 + (ch >>> 10 ))
127
124
buf[out += 1 ] = UInt16 (0xdc00 + (ch & 0x3ff ))
@@ -241,12 +238,12 @@ function encode_to_utf8{T<:Union(UInt16, UInt32)}(::Type{T}, dat, len)
241
238
buf[out += 1 ] = 0xc0 | (ch >>> 6 )
242
239
buf[out += 1 ] = 0x80 | (ch & 0x3f )
243
240
# Handle 0x10000-0x10ffff (if input is UInt32)
244
- elseif T == UInt32 && ch > 0xffff
245
- output_utf8_4 (buf, out, ch)
241
+ elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
242
+ output_utf8_4byte! (buf, out, ch)
246
243
out += 4
247
244
# Handle surrogate pairs
248
245
elseif is_surrogate_codeunit (ch)
249
- output_utf8_4 (buf, out, get_supplementary (ch, dat[pos += 1 ]))
246
+ output_utf8_4byte! (buf, out, get_supplementary (ch, dat[pos += 1 ]))
250
247
out += 4
251
248
# Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
252
249
else
@@ -277,7 +274,7 @@ function convert(::Type{UTF32String}, str::UTF8String)
277
274
len, flags = check_string_utf8 (dat)
278
275
# Optimize case where no characters > 0x7f
279
276
totlen = len+ 1
280
- flags == 0 && return fast_utf_copy (Char, totlen, dat)
277
+ flags == 0 && return fast_utf_copy (UTF32String, Char, totlen, dat)
281
278
# has multi-byte UTF-8 sequences
282
279
buf = Vector {Char} (totlen)
283
280
@inbounds buf[totlen] = 0 # NULL termination
@@ -295,7 +292,7 @@ function convert(::Type{UTF32String}, str::UTF8String)
295
292
# Handle range 0x800-0xffff
296
293
elseif ch < 0xf0
297
294
pos += 2
298
- ch = get_utf8_3 (dat, pos, ch)
295
+ ch = get_utf8_3byte (dat, pos, ch)
299
296
# Handle surrogate pairs (should have been encoded in 4 bytes)
300
297
if is_surrogate_lead (ch)
301
298
# Build up 32-bit character from ch and trailing surrogate in next 3 bytes
@@ -309,7 +306,7 @@ function convert(::Type{UTF32String}, str::UTF8String)
309
306
# Handle range 0x10000-0x10ffff
310
307
else
311
308
pos += 3
312
- buf[out += 1 ] = get_utf8_4 (dat, pos, ch)
309
+ buf[out += 1 ] = get_utf8_4byte (dat, pos, ch)
313
310
end
314
311
end
315
312
UTF32String (buf)
@@ -367,7 +364,7 @@ function convert(::Type{UTF16String}, dat::Vector{UInt32})
367
364
len, flags, num4byte = check_string_utf32 (dat, len>>> 2 )
368
365
len += num4byte + 1
369
366
# optimized path, no surrogates
370
- num4byte == 0 && return fast_utf_copy (UInt16, len, dat)
367
+ num4byte == 0 && return fast_utf_copy (UTF16String, UInt16, len, dat)
371
368
return encode_to_utf16 (dat, len)
372
369
end
373
370
@@ -423,22 +420,31 @@ end
423
420
424
421
# Vector{Char} -> UTF8String: reinterprets the Char data as UInt32 and forwards to
# the convert method that accepts that reinterpreted data.
convert (:: Type{UTF8String} , dat:: Vector{Char} ) = convert (UTF8String, reinterpret (UInt32, dat))
425
422
423
# The two `+` one-liners below are added at this position by the diff; the same
# text appears as `-` lines later in the hunk, i.e. they are moved, not new.
# UTF16String -> UTF16String: returns the argument itself.
+ convert (:: Type{UTF16String} , str:: UTF16String ) = str
424
# Vector{Char} -> UTF16String: reinterprets the Char data as UInt32 and forwards.
+ convert (:: Type{UTF16String} , dat:: Vector{Char} ) = convert (UTF16String, reinterpret (UInt32, dat))
425
+
426
426
# ASCIIString -> UTF16String.
# Takes the string's `data` field and hands it to fast_utf_copy with a length of
# length(dat)+1, i.e. one extra slot that fast_utf_copy sets to 0.
# The diff only changes the call: the result type UTF16String is now passed
# explicitly as the first argument.
function convert (:: Type{UTF16String} , str:: ASCIIString )
427
427
dat = str. data
428
- fast_utf_copy (UInt16, length (dat)+ 1 , dat)
428
+ fast_utf_copy (UTF16String, UInt16, length (dat)+ 1 , dat)
429
429
end
430
430
431
# Diff hunk: the `-` lines remove the ASCIIString -> UTF32String method from this
# position; the `+` lines put a new AbstractVector{UInt16} -> UTF16String method in
# its place. Both share the trailing `end` context line.
- function convert (:: Type{UTF32String } , str :: ASCIIString )
432
- dat = str . data
433
- fast_utf_copy (Char, length (dat) + 1 , dat )
431
# AbstractVector{UInt16} -> UTF16String.
# Throws ArgumentError when isvalid(UTF16String, data) is false. Otherwise calls
# fast_utf_copy with flag = true and len = length(data), which copies all of
# `data` into a buffer one element longer and sets the extra element to 0.
+ function convert (:: Type{UTF16String } , data :: AbstractVector{UInt16} )
432
+ ! isvalid (UTF16String, data) && throw ( ArgumentError ( " invalid UTF16 data" ))
433
+ fast_utf_copy (UTF16String, UInt16, length (data), data, true )
434
434
end
435
435
436
- convert (:: Type{UTF16String} , str:: UTF16String ) = str
437
- convert (:: Type{UTF16String} , dat:: Vector{Char} ) = convert (UTF16String, reinterpret (UInt32, dat))
438
-
439
436
# UTF16String -> Vector{UInt16} / Array{UInt16}: both return the string's `data`
# field as-is. No copy is made on these lines, so the result is the same object
# the string holds.
convert (:: Type{Vector{UInt16}} , str:: UTF16String ) = str. data
440
437
convert (:: Type{Array{UInt16}} , str:: UTF16String ) = str. data
441
438
442
439
# UTF32String -> UTF32String: returns the argument itself.
convert (:: Type{UTF32String} , str:: UTF32String ) = str
443
440
444
441
# Char -> UTF32String: wraps a two-element Char array holding `c` followed by
# Char(0).
convert (:: Type{UTF32String} , c:: Char ) = UTF32String (Char[c, Char (0 )])
442
+
443
# ASCIIString -> UTF32String (added at this position by the diff).
# Takes the string's `data` field and hands it to fast_utf_copy with a length of
# length(dat)+1, i.e. one extra slot that fast_utf_copy sets to 0. The result
# type UTF32String and element type Char are passed explicitly.
+ function convert (:: Type{UTF32String} , str:: ASCIIString )
444
+ dat = str. data
445
+ fast_utf_copy (UTF32String, Char, length (dat)+ 1 , dat)
446
+ end
447
+
448
# AbstractVector{Char} -> UTF32String (added by the diff).
# Calls fast_utf_copy with flag = true and len = length(dat), which copies all of
# `dat` into a buffer one element longer and sets the extra element to 0.
# NOTE(review): unlike the AbstractVector{UInt16} -> UTF16String method, no
# validity check is performed on this line; confirm that is intended.
+ convert (:: Type{UTF32String} , dat:: AbstractVector{Char} ) = fast_utf_copy (UTF32String, Char, length (dat), dat, true )
449
+
450
+
0 commit comments