Skip to content

Commit 58cc026

Browse files
committed
Update based on review comments
1 parent 7c57f34 commit 58cc026

File tree

5 files changed: 35 additions and 38 deletions

base/sysimg.jl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -87,12 +87,12 @@ include("osutils.jl")
8787
include("utferror.jl")
8888
include("utftype.jl")
8989
include("utfcheck.jl")
90+
include("utfconvert.jl")
9091
include("char.jl")
9192
include("ascii.jl")
9293
include("utf8.jl")
9394
include("utf16.jl")
9495
include("utf32.jl")
95-
include("utfconvert.jl")
9696
include("iobuffer.jl")
9797
include("string.jl")
9898
include("utf8proc.jl")

base/utf16.jl

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -74,12 +74,6 @@ end
7474
unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String) =
7575
convert(Ptr{T}, pointer(s))
7676

77-
function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
78-
!isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
79-
len = length(data)
80-
@inbounds return UTF16String(setindex!(copy!(Vector{UInt16}(len+1),1,data,1,len),0,len+1))
81-
end
82-
8377
convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) =
8478
convert(T, reshape(data, length(data)))
8579

base/utf32.jl

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -9,11 +9,6 @@ reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
99

1010
sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char)
1111

12-
function convert(::Type{UTF32String}, data::AbstractVector{Char})
13-
len = length(data)
14-
@inbounds return UTF32String(setindex!(copy!(Vector{Char}(len+1),1,data,1,len),0,len+1))
15-
end
16-
1712
convert{T<:Union(Int32,UInt32)}(::Type{UTF32String}, data::AbstractVector{T}) =
1813
convert(UTF32String, reinterpret(Char, data))
1914

@@ -45,7 +40,9 @@ function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
4540
copy!(d,1, data, 2, length(data)-1)
4641
elseif data[1] == Char(0xfffe0000) # byte-swapped
4742
d = Array(Char, length(data))
48-
@inbounds for i = 2:length(data) ; d[i-1] = bswap(data[i]) ; end
43+
for i = 2:length(data)
44+
@inbounds d[i-1] = bswap(data[i])
45+
end
4946
else
5047
d = Array(Char, length(data) + 1)
5148
copy!(d, 1, data, 1, length(data)) # assume native byte order

base/utfconvert.jl

Lines changed: 30 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -3,28 +3,25 @@
33
# Functions to convert to different UTF encodings
44

55
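# Quickly copy dat into a newly allocated vector and set the trailing \0.
# When the optional flag argument is true, len is the number of data elements and
# one extra slot is allocated for the terminator; otherwise len already includes
# the terminator slot.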
6-
@inline function fast_utf_copy(T::Type{UInt16}, len, dat)
7-
@inbounds return UTF16String(setindex!(copy!(Vector{T}(len), dat), 0, len))
8-
end
9-
@inline function fast_utf_copy(T::Type{Char}, len, dat)
10-
@inbounds return UTF32String(setindex!(copy!(Vector{T}(len), dat), 0, len))
6+
@inline function fast_utf_copy{S <: Union(UTF16String, UTF32String), T <: Union(UInt16, Char)}(::Type{S}, ::Type{T}, len, dat, flag::Bool=false)
7+
@inbounds return flag ? S(setindex!(copy!(Vector{T}(len+1),1,dat,1,len),0,len+1)) : S(setindex!(copy!(Vector{T}(len), dat), 0, len))
118
end
129

1310
# Get rest of character ch from 3-byte UTF-8 sequence in dat
14-
@inline function get_utf8_3(dat, pos, ch)
11+
@inline function get_utf8_3byte(dat, pos, ch)
1512
@inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f)
1613
end
1714

1815
# Get rest of character ch from 4-byte UTF-8 sequence in dat
19-
@inline function get_utf8_4(dat, pos, ch)
16+
@inline function get_utf8_4byte(dat, pos, ch)
2017
@inbounds return (((ch & 0x7) << 18)
2118
| (UInt32(dat[pos-2] & 0x3f) << 12)
2219
| (UInt32(dat[pos-1] & 0x3f) << 6)
2320
| (dat[pos] & 0x3f))
2421
end
2522

2623
# Output a character as a 4-byte UTF-8 sequence
27-
@inline function output_utf8_4(buf, out, ch)
24+
@inline function output_utf8_4byte!(buf, out, ch)
2825
@inbounds begin
2926
buf[out + 1] = 0xf0 | (ch >>> 18)
3027
buf[out + 2] = 0x80 | ((ch >>> 12) & 0x3f)
@@ -117,11 +114,11 @@ function convert(::Type{UTF16String}, str::UTF8String)
117114
# Handle range 0x800-0xffff
118115
elseif ch < 0xf0
119116
pos += 2
120-
buf[out += 1] = get_utf8_3(dat, pos, ch)
117+
buf[out += 1] = get_utf8_3byte(dat, pos, ch)
121118
# Handle range 0x10000-0x10ffff
122119
else
123120
pos += 3
124-
ch = get_utf8_4(dat, pos, ch)
121+
ch = get_utf8_4byte(dat, pos, ch)
125122
# output surrogate pair
126123
buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10))
127124
buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff))
@@ -241,12 +238,12 @@ function encode_to_utf8{T<:Union(UInt16, UInt32)}(::Type{T}, dat, len)
241238
buf[out += 1] = 0xc0 | (ch >>> 6)
242239
buf[out += 1] = 0x80 | (ch & 0x3f)
243240
# Handle 0x10000-0x10ffff (if input is UInt32)
244-
elseif T == UInt32 && ch > 0xffff
245-
output_utf8_4(buf, out, ch)
241+
elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
242+
output_utf8_4byte!(buf, out, ch)
246243
out += 4
247244
# Handle surrogate pairs
248245
elseif is_surrogate_codeunit(ch)
249-
output_utf8_4(buf, out, get_supplementary(ch, dat[pos += 1]))
246+
output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1]))
250247
out += 4
251248
# Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
252249
else
@@ -277,7 +274,7 @@ function convert(::Type{UTF32String}, str::UTF8String)
277274
len, flags = check_string_utf8(dat)
278275
# Optimize case where no characters > 0x7f
279276
totlen = len+1
280-
flags == 0 && return fast_utf_copy(Char, totlen, dat)
277+
flags == 0 && return fast_utf_copy(UTF32String, Char, totlen, dat)
281278
# has multi-byte UTF-8 sequences
282279
buf = Vector{Char}(totlen)
283280
@inbounds buf[totlen] = 0 # NULL termination
@@ -295,7 +292,7 @@ function convert(::Type{UTF32String}, str::UTF8String)
295292
# Handle range 0x800-0xffff
296293
elseif ch < 0xf0
297294
pos += 2
298-
ch = get_utf8_3(dat, pos, ch)
295+
ch = get_utf8_3byte(dat, pos, ch)
299296
# Handle surrogate pairs (should have been encoded in 4 bytes)
300297
if is_surrogate_lead(ch)
301298
# Build up 32-bit character from ch and trailing surrogate in next 3 bytes
@@ -309,7 +306,7 @@ function convert(::Type{UTF32String}, str::UTF8String)
309306
# Handle range 0x10000-0x10ffff
310307
else
311308
pos += 3
312-
buf[out += 1] = get_utf8_4(dat, pos, ch)
309+
buf[out += 1] = get_utf8_4byte(dat, pos, ch)
313310
end
314311
end
315312
UTF32String(buf)
@@ -367,7 +364,7 @@ function convert(::Type{UTF16String}, dat::Vector{UInt32})
367364
len, flags, num4byte = check_string_utf32(dat, len>>>2)
368365
len += num4byte + 1
369366
# optimized path, no surrogates
370-
num4byte == 0 && return fast_utf_copy(UInt16, len, dat)
367+
num4byte == 0 && return fast_utf_copy(UTF16String, UInt16, len, dat)
371368
return encode_to_utf16(dat, len)
372369
end
373370

@@ -423,22 +420,31 @@ end
423420

424421
convert(::Type{UTF8String}, dat::Vector{Char}) = convert(UTF8String, reinterpret(UInt32, dat))
425422

423+
convert(::Type{UTF16String}, str::UTF16String) = str
424+
convert(::Type{UTF16String}, dat::Vector{Char}) = convert(UTF16String, reinterpret(UInt32, dat))
425+
426426
function convert(::Type{UTF16String}, str::ASCIIString)
427427
dat = str.data
428-
fast_utf_copy(UInt16, length(dat)+1, dat)
428+
fast_utf_copy(UTF16String, UInt16, length(dat)+1, dat)
429429
end
430430

431-
function convert(::Type{UTF32String}, str::ASCIIString)
432-
dat = str.data
433-
fast_utf_copy(Char, length(dat)+1, dat)
431+
function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
432+
!isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
433+
fast_utf_copy(UTF16String, UInt16, length(data), data, true)
434434
end
435435

436-
convert(::Type{UTF16String}, str::UTF16String) = str
437-
convert(::Type{UTF16String}, dat::Vector{Char}) = convert(UTF16String, reinterpret(UInt32, dat))
438-
439436
convert(::Type{Vector{UInt16}}, str::UTF16String) = str.data
440437
convert(::Type{Array{UInt16}}, str::UTF16String) = str.data
441438

442439
convert(::Type{UTF32String}, str::UTF32String) = str
443440

444441
convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)])
442+
443+
function convert(::Type{UTF32String}, str::ASCIIString)
444+
dat = str.data
445+
fast_utf_copy(UTF32String, Char, length(dat)+1, dat)
446+
end
447+
448+
convert(::Type{UTF32String}, dat::AbstractVector{Char}) = fast_utf_copy(UTF32String, Char, length(dat), dat, true)
449+
450+

base/utferror.jl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ const errMsgs = [
4343
@throws never returns, always throws ArgumentError
4444
""" ->
4545
=#
46-
function utf_errfunc(errcode::Integer, charpos, invchar)
46+
@noinline function utf_errfunc(errcode::Integer, charpos, invchar)
4747
if errcode < 1 || errcode > UTF_ERR_MAX
4848
throw(ArgumentError("Invalid error code for Unicode error: $errcode, Pos = $charpos, Char = $invchar"))
4949
end

0 commit comments

Comments
 (0)