From 63e2718123ce36147a602d5c8749555a5dcfe430 Mon Sep 17 00:00:00 2001 From: Peter Arato Date: Tue, 9 May 2023 12:00:12 -0400 Subject: [PATCH] Add String#byteindex Add String#byterindex Add tests --- CHANGELOG.md | 1 + spec/ruby/core/string/byteindex_spec.rb | 304 +++++++++++++++ spec/ruby/core/string/byterindex_spec.rb | 359 ++++++++++++++++++ .../core/string/shared/byte_index_common.rb | 63 +++ spec/truffle/methods/String.txt | 2 + spec/truffleruby.next-specs | 3 + .../truffleruby/core/string/StringNodes.java | 33 ++ src/main/ruby/truffleruby/core/string.rb | 76 ++++ .../core/truffle/polyglot_methods.rb | 8 + .../core/truffle/string_operations.rb | 11 + 10 files changed, 860 insertions(+) create mode 100644 spec/ruby/core/string/byteindex_spec.rb create mode 100644 spec/ruby/core/string/byterindex_spec.rb create mode 100644 spec/ruby/core/string/shared/byte_index_common.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index cd71879c8559..27497c18b24e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Compatibility: * Fix `Array#[]` with `ArithmeticSequence` argument when step is negative (#3039, @itarato). * Fix `Range#size` and return `nil` for beginningless Range when end isn't Numeric (#3039, @rwstauner). * Alias `String#-@` to `String#dedup` (#3039, @itarato). +* Add `String#byteindex` and `String#byterindex` (#3039, @itarato). Performance: diff --git a/spec/ruby/core/string/byteindex_spec.rb b/spec/ruby/core/string/byteindex_spec.rb new file mode 100644 index 000000000000..7be0c7ec1ef9 --- /dev/null +++ b/spec/ruby/core/string/byteindex_spec.rb @@ -0,0 +1,304 @@ +# -*- encoding: utf-8 -*- +require_relative '../../spec_helper' +require_relative 'fixtures/classes' +require_relative 'shared/byte_index_common.rb' + +describe "String#byteindex" do + ruby_version_is "3.2" do + it "calls #to_str to convert the first argument" do + char = mock("string index char") + char.should_receive(:to_str).and_return("b") + "abc".byteindex(char).should == 1 + end + + it "calls #to_int to convert the second argument" do + offset = mock("string index offset") + offset.should_receive(:to_int).and_return(1) + "abc".byteindex("c", offset).should == 2 + end + + it "does not raise IndexError when byte offset is correct or on string boundary" do + "わ".byteindex("").should == 0 + "わ".byteindex("", 0).should == 0 + "わ".byteindex("", 3).should == 3 + end + + it_behaves_like :byte_index_common, :byteindex + end +end + +describe "String#byteindex with String" do + ruby_version_is "3.2" do + it "behaves the same as String#byteindex(char) for one-character strings" do + "blablabla hello cruel world...!".split("").uniq.each do |str| + chr = str[0] + str.byteindex(str).should == str.byteindex(chr) + + 0.upto(str.size + 1) do |start| + str.byteindex(str, start).should == str.byteindex(chr, start) + end + + (-str.size - 1).upto(-1) do |start| + str.byteindex(str, start).should == str.byteindex(chr, start) + end + end + end + + it "returns the byteindex of the first occurrence of the given substring" do + "blablabla".byteindex("").should == 0 + "blablabla".byteindex("b").should == 0 + "blablabla".byteindex("bla").should == 0 + "blablabla".byteindex("blabla").should == 0 + "blablabla".byteindex("blablabla").should == 0 + + "blablabla".byteindex("l").should == 1 + "blablabla".byteindex("la").should == 1 + "blablabla".byteindex("labla").should == 1 + "blablabla".byteindex("lablabla").should == 1 + + "blablabla".byteindex("a").should == 2 + "blablabla".byteindex("abla").should == 2 + "blablabla".byteindex("ablabla").should == 2 + end + + it "treats the offset as a byteindex" do + "aaaaa".byteindex("a", 0).should == 0 + "aaaaa".byteindex("a", 2).should == 2 + "aaaaa".byteindex("a", 4).should == 4 + end + + it "ignores string subclasses" do + "blablabla".byteindex(StringSpecs::MyString.new("bla")).should == 0 + StringSpecs::MyString.new("blablabla").byteindex("bla").should == 0 + StringSpecs::MyString.new("blablabla").byteindex(StringSpecs::MyString.new("bla")).should == 0 + end + + it "starts the search at the given offset" do + "blablabla".byteindex("bl", 0).should == 0 + "blablabla".byteindex("bl", 1).should == 3 + "blablabla".byteindex("bl", 2).should == 3 + "blablabla".byteindex("bl", 3).should == 3 + + "blablabla".byteindex("bla", 0).should == 0 + "blablabla".byteindex("bla", 1).should == 3 + "blablabla".byteindex("bla", 2).should == 3 + "blablabla".byteindex("bla", 3).should == 3 + + "blablabla".byteindex("blab", 0).should == 0 + "blablabla".byteindex("blab", 1).should == 3 + "blablabla".byteindex("blab", 2).should == 3 + "blablabla".byteindex("blab", 3).should == 3 + + "blablabla".byteindex("la", 1).should == 1 + "blablabla".byteindex("la", 2).should == 4 + "blablabla".byteindex("la", 3).should == 4 + "blablabla".byteindex("la", 4).should == 4 + + "blablabla".byteindex("lab", 1).should == 1 + "blablabla".byteindex("lab", 2).should == 4 + "blablabla".byteindex("lab", 3).should == 4 + "blablabla".byteindex("lab", 4).should == 4 + + "blablabla".byteindex("ab", 2).should == 2 + "blablabla".byteindex("ab", 3).should == 5 + "blablabla".byteindex("ab", 4).should == 5 + "blablabla".byteindex("ab", 5).should == 5 + + "blablabla".byteindex("", 0).should == 0 + "blablabla".byteindex("", 1).should == 1 + "blablabla".byteindex("", 2).should == 2 + "blablabla".byteindex("", 7).should == 7 + "blablabla".byteindex("", 8).should == 8 + "blablabla".byteindex("", 9).should == 9 + end + + it "starts the search at offset + self.length if offset is negative" do + str = "blablabla" + + ["bl", "bla", "blab", "la", "lab", "ab", ""].each do |needle| + (-str.length .. -1).each do |offset| + str.byteindex(needle, offset).should == + str.byteindex(needle, offset + str.length) + end + end + end + + it "returns nil if the substring isn't found" do + "blablabla".byteindex("B").should == nil + "blablabla".byteindex("z").should == nil + "blablabla".byteindex("BLA").should == nil + "blablabla".byteindex("blablablabla").should == nil + "blablabla".byteindex("", 10).should == nil + + "hello".byteindex("he", 1).should == nil + "hello".byteindex("he", 2).should == nil + "I’ve got a multibyte character.\n".byteindex("\n\n").should == nil + end + + it "returns the character byteindex of a multibyte character" do + "ありがとう".byteindex("が").should == 6 + end + + it "returns the character byteindex after offset" do + "われわれ".byteindex("わ", 3).should == 6 + "ありがとうありがとう".byteindex("が", 9).should == 21 + end + + it "returns the character byteindex after a partial first match" do + " do + "あれ".byteindex(char) + end.should raise_error(Encoding::CompatibilityError) + end + + it "handles a substring in a superset encoding" do + 'abc'.force_encoding(Encoding::US_ASCII).byteindex('é').should == nil + end + + it "handles a substring in a subset encoding" do + 'été'.byteindex('t'.force_encoding(Encoding::US_ASCII)).should == 2 + end + end +end + +describe "String#byteindex with Regexp" do + ruby_version_is "3.2" do + it "behaves the same as String#byteindex(string) for escaped string regexps" do + ["blablabla", "hello cruel world...!"].each do |str| + ["", "b", "bla", "lab", "o c", "d."].each do |needle| + regexp = Regexp.new(Regexp.escape(needle)) + str.byteindex(regexp).should == str.byteindex(needle) + + 0.upto(str.size + 1) do |start| + str.byteindex(regexp, start).should == str.byteindex(needle, start) + end + + (-str.size - 1).upto(-1) do |start| + str.byteindex(regexp, start).should == str.byteindex(needle, start) + end + end + end + end + + it "returns the byteindex of the first match of regexp" do + "blablabla".byteindex(/bla/).should == 0 + "blablabla".byteindex(/BLA/i).should == 0 + + "blablabla".byteindex(/.{0}/).should == 0 + "blablabla".byteindex(/.{6}/).should == 0 + "blablabla".byteindex(/.{9}/).should == 0 + + "blablabla".byteindex(/.*/).should == 0 + "blablabla".byteindex(/.+/).should == 0 + + "blablabla".byteindex(/lab|b/).should == 0 + + not_supported_on :opal do + "blablabla".byteindex(/\A/).should == 0 + "blablabla".byteindex(/\Z/).should == 9 + "blablabla".byteindex(/\z/).should == 9 + "blablabla\n".byteindex(/\Z/).should == 9 + "blablabla\n".byteindex(/\z/).should == 10 + end + + "blablabla".byteindex(/^/).should == 0 + "\nblablabla".byteindex(/^/).should == 0 + "b\nablabla".byteindex(/$/).should == 1 + "bl\nablabla".byteindex(/$/).should == 2 + + "blablabla".byteindex(/.l./).should == 0 + end + + it "starts the search at the given offset" do + "blablabla".byteindex(/.{0}/, 5).should == 5 + "blablabla".byteindex(/.{1}/, 5).should == 5 + "blablabla".byteindex(/.{2}/, 5).should == 5 + "blablabla".byteindex(/.{3}/, 5).should == 5 + "blablabla".byteindex(/.{4}/, 5).should == 5 + + "blablabla".byteindex(/.{0}/, 3).should == 3 + "blablabla".byteindex(/.{1}/, 3).should == 3 + "blablabla".byteindex(/.{2}/, 3).should == 3 + "blablabla".byteindex(/.{5}/, 3).should == 3 + "blablabla".byteindex(/.{6}/, 3).should == 3 + + "blablabla".byteindex(/.l./, 0).should == 0 + "blablabla".byteindex(/.l./, 1).should == 3 + "blablabla".byteindex(/.l./, 2).should == 3 + "blablabla".byteindex(/.l./, 3).should == 3 + + "xblaxbla".byteindex(/x./, 0).should == 0 + "xblaxbla".byteindex(/x./, 1).should == 4 + "xblaxbla".byteindex(/x./, 2).should == 4 + + not_supported_on :opal do + "blablabla\n".byteindex(/\Z/, 9).should == 9 + end + end + + it "starts the search at offset + self.length if offset is negative" do + str = "blablabla" + + ["bl", "bla", "blab", "la", "lab", "ab", ""].each do |needle| + (-str.length .. -1).each do |offset| + str.byteindex(needle, offset).should == + str.byteindex(needle, offset + str.length) + end + end + end + + it "returns nil if the substring isn't found" do + "blablabla".byteindex(/BLA/).should == nil + + "blablabla".byteindex(/.{10}/).should == nil + "blaxbla".byteindex(/.x/, 3).should == nil + "blaxbla".byteindex(/..x/, 2).should == nil + end + + it "returns nil if the Regexp matches the empty string and the offset is out of range" do + "ruby".byteindex(//, 12).should be_nil + end + + it "supports \\G which matches at the given start offset" do + "helloYOU.".byteindex(/\GYOU/, 5).should == 5 + "helloYOU.".byteindex(/\GYOU/).should == nil + + re = /\G.+YOU/ + # The # marks where \G will match. + [ + ["#hi!YOUall.", 0], + ["h#i!YOUall.", 1], + ["hi#!YOUall.", 2], + ["hi!#YOUall.", nil] + ].each do |spec| + + start = spec[0].byteindex("#") + str = spec[0].delete("#") + + str.byteindex(re, start).should == spec[1] + end + end + + it "converts start_offset to an integer via to_int" do + obj = mock('1') + obj.should_receive(:to_int).and_return(1) + "RWOARW".byteindex(/R./, obj).should == 4 + end + + it "returns the character byteindex of a multibyte character" do + "ありがとう".byteindex(/が/).should == 6 + end + + it "returns the character byteindex after offset" do + "われわれ".byteindex(/わ/, 3).should == 6 + end + + it "treats the offset as a byteindex" do + "われわわれ".byteindex(/わ/, 6).should == 6 + end + end +end diff --git a/spec/ruby/core/string/byterindex_spec.rb b/spec/ruby/core/string/byterindex_spec.rb new file mode 100644 index 000000000000..717708c97d36 --- /dev/null +++ b/spec/ruby/core/string/byterindex_spec.rb @@ -0,0 +1,359 @@ +# -*- encoding: utf-8 -*- +require_relative '../../spec_helper' +require_relative 'fixtures/classes' +require_relative 'shared/byte_index_common.rb' + +describe "String#byterindex with object" do + ruby_version_is "3.2" do + it "tries to convert obj to a string via to_str" do + obj = mock('lo') + def obj.to_str() "lo" end + "hello".byterindex(obj).should == "hello".byterindex("lo") + + obj = mock('o') + def obj.respond_to?(arg, *) true end + def obj.method_missing(*args) "o" end + "hello".byterindex(obj).should == "hello".byterindex("o") + end + + it "calls #to_int to convert the second argument" do + offset = mock("string index offset") + offset.should_receive(:to_int).and_return(3) + "abc".byterindex("c", offset).should == 2 + end + + it "does not raise IndexError when byte offset is correct or on string boundary" do + "わ".byterindex("", 0).should == 0 + "わ".byterindex("", 3).should == 3 + "わ".byterindex("").should == 3 + end + + it_behaves_like :byte_index_common, :byterindex + end +end + +describe "String#byterindex with String" do + ruby_version_is "3.2" do + it "behaves the same as String#byterindex(char) for one-character strings" do + "blablabla hello cruel world...!".split("").uniq.each do |str| + chr = str[0] + str.byterindex(str).should == str.byterindex(chr) + + 0.upto(str.size + 1) do |start| + str.byterindex(str, start).should == str.byterindex(chr, start) + end + + (-str.size - 1).upto(-1) do |start| + str.byterindex(str, start).should == str.byterindex(chr, start) + end + end + end + + it "behaves the same as String#byterindex(?char) for one-character strings" do + "blablabla hello cruel world...!".split("").uniq.each do |str| + chr = str[0] =~ / / ? str[0] : eval("?#{str[0]}") + str.byterindex(str).should == str.byterindex(chr) + + 0.upto(str.size + 1) do |start| + str.byterindex(str, start).should == str.byterindex(chr, start) + end + + (-str.size - 1).upto(-1) do |start| + str.byterindex(str, start).should == str.byterindex(chr, start) + end + end + end + + it "returns the index of the last occurrence of the given substring" do + "blablabla".byterindex("").should == 9 + "blablabla".byterindex("a").should == 8 + "blablabla".byterindex("la").should == 7 + "blablabla".byterindex("bla").should == 6 + "blablabla".byterindex("abla").should == 5 + "blablabla".byterindex("labla").should == 4 + "blablabla".byterindex("blabla").should == 3 + "blablabla".byterindex("ablabla").should == 2 + "blablabla".byterindex("lablabla").should == 1 + "blablabla".byterindex("blablabla").should == 0 + + "blablabla".byterindex("l").should == 7 + "blablabla".byterindex("bl").should == 6 + "blablabla".byterindex("abl").should == 5 + "blablabla".byterindex("labl").should == 4 + "blablabla".byterindex("blabl").should == 3 + "blablabla".byterindex("ablabl").should == 2 + "blablabla".byterindex("lablabl").should == 1 + "blablabla".byterindex("blablabl").should == 0 + + "blablabla".byterindex("b").should == 6 + "blablabla".byterindex("ab").should == 5 + "blablabla".byterindex("lab").should == 4 + "blablabla".byterindex("blab").should == 3 + "blablabla".byterindex("ablab").should == 2 + "blablabla".byterindex("lablab").should == 1 + "blablabla".byterindex("blablab").should == 0 + end + + it "ignores string subclasses" do + "blablabla".byterindex(StringSpecs::MyString.new("bla")).should == 6 + StringSpecs::MyString.new("blablabla").byterindex("bla").should == 6 + StringSpecs::MyString.new("blablabla").byterindex(StringSpecs::MyString.new("bla")).should == 6 + end + + it "starts the search at the given offset" do + "blablabla".byterindex("bl", 0).should == 0 + "blablabla".byterindex("bl", 1).should == 0 + "blablabla".byterindex("bl", 2).should == 0 + "blablabla".byterindex("bl", 3).should == 3 + + "blablabla".byterindex("bla", 0).should == 0 + "blablabla".byterindex("bla", 1).should == 0 + "blablabla".byterindex("bla", 2).should == 0 + "blablabla".byterindex("bla", 3).should == 3 + + "blablabla".byterindex("blab", 0).should == 0 + "blablabla".byterindex("blab", 1).should == 0 + "blablabla".byterindex("blab", 2).should == 0 + "blablabla".byterindex("blab", 3).should == 3 + "blablabla".byterindex("blab", 6).should == 3 + "blablablax".byterindex("blab", 6).should == 3 + + "blablabla".byterindex("la", 1).should == 1 + "blablabla".byterindex("la", 2).should == 1 + "blablabla".byterindex("la", 3).should == 1 + "blablabla".byterindex("la", 4).should == 4 + + "blablabla".byterindex("lab", 1).should == 1 + "blablabla".byterindex("lab", 2).should == 1 + "blablabla".byterindex("lab", 3).should == 1 + "blablabla".byterindex("lab", 4).should == 4 + + "blablabla".byterindex("ab", 2).should == 2 + "blablabla".byterindex("ab", 3).should == 2 + "blablabla".byterindex("ab", 4).should == 2 + "blablabla".byterindex("ab", 5).should == 5 + + "blablabla".byterindex("", 0).should == 0 + "blablabla".byterindex("", 1).should == 1 + "blablabla".byterindex("", 2).should == 2 + "blablabla".byterindex("", 7).should == 7 + "blablabla".byterindex("", 8).should == 8 + "blablabla".byterindex("", 9).should == 9 + "blablabla".byterindex("", 10).should == 9 + end + + it "starts the search at offset + self.length if offset is negative" do + str = "blablabla" + + ["bl", "bla", "blab", "la", "lab", "ab", ""].each do |needle| + (-str.length .. -1).each do |offset| + str.byterindex(needle, offset).should == + str.byterindex(needle, offset + str.length) + end + end + end + + it "returns nil if the substring isn't found" do + "blablabla".byterindex("B").should == nil + "blablabla".byterindex("z").should == nil + "blablabla".byterindex("BLA").should == nil + "blablabla".byterindex("blablablabla").should == nil + + "hello".byterindex("lo", 0).should == nil + "hello".byterindex("lo", 1).should == nil + "hello".byterindex("lo", 2).should == nil + + "hello".byterindex("llo", 0).should == nil + "hello".byterindex("llo", 1).should == nil + + "hello".byterindex("el", 0).should == nil + "hello".byterindex("ello", 0).should == nil + + "hello".byterindex("", -6).should == nil + "hello".byterindex("", -7).should == nil + + "hello".byterindex("h", -6).should == nil + end + + it "tries to convert start_offset to an integer via to_int" do + obj = mock('5') + def obj.to_int() 5 end + "str".byterindex("st", obj).should == 0 + + obj = mock('5') + def obj.respond_to?(arg, *) true end + def obj.method_missing(*args) 5 end + "str".byterindex("st", obj).should == 0 + end + + it "raises a TypeError when given offset is nil" do + -> { "str".byterindex("st", nil) }.should raise_error(TypeError) + end + + it "handles a substring in a superset encoding" do + 'abc'.force_encoding(Encoding::US_ASCII).byterindex('é').should == nil + end + + it "handles a substring in a subset encoding" do + 'été'.byterindex('t'.force_encoding(Encoding::US_ASCII)).should == 2 + end + end +end + +describe "String#byterindex with Regexp" do + ruby_version_is "3.2" do + it "behaves the same as String#byterindex(string) for escaped string regexps" do + ["blablabla", "hello cruel world...!"].each do |str| + ["", "b", "bla", "lab", "o c", "d."].each do |needle| + regexp = Regexp.new(Regexp.escape(needle)) + str.byterindex(regexp).should == str.byterindex(needle) + + 0.upto(str.size + 1) do |start| + str.byterindex(regexp, start).should == str.byterindex(needle, start) + end + + (-str.size - 1).upto(-1) do |start| + str.byterindex(regexp, start).should == str.byterindex(needle, start) + end + end + end + end + + it "returns the index of the first match from the end of string of regexp" do + "blablabla".byterindex(/bla/).should == 6 + "blablabla".byterindex(/BLA/i).should == 6 + + "blablabla".byterindex(/.{0}/).should == 9 + "blablabla".byterindex(/.{1}/).should == 8 + "blablabla".byterindex(/.{2}/).should == 7 + "blablabla".byterindex(/.{6}/).should == 3 + "blablabla".byterindex(/.{9}/).should == 0 + + "blablabla".byterindex(/.*/).should == 9 + "blablabla".byterindex(/.+/).should == 8 + + "blablabla".byterindex(/bla|a/).should == 8 + + not_supported_on :opal do + "blablabla".byterindex(/\A/).should == 0 + "blablabla".byterindex(/\Z/).should == 9 + "blablabla".byterindex(/\z/).should == 9 + "blablabla\n".byterindex(/\Z/).should == 10 + "blablabla\n".byterindex(/\z/).should == 10 + end + + "blablabla".byterindex(/^/).should == 0 + not_supported_on :opal do + "\nblablabla".byterindex(/^/).should == 1 + "b\nlablabla".byterindex(/^/).should == 2 + end + "blablabla".byterindex(/$/).should == 9 + + "blablabla".byterindex(/.l./).should == 6 + end + + it "starts the search at the given offset" do + "blablabla".byterindex(/.{0}/, 5).should == 5 + "blablabla".byterindex(/.{1}/, 5).should == 5 + "blablabla".byterindex(/.{2}/, 5).should == 5 + "blablabla".byterindex(/.{3}/, 5).should == 5 + "blablabla".byterindex(/.{4}/, 5).should == 5 + + "blablabla".byterindex(/.{0}/, 3).should == 3 + "blablabla".byterindex(/.{1}/, 3).should == 3 + "blablabla".byterindex(/.{2}/, 3).should == 3 + "blablabla".byterindex(/.{5}/, 3).should == 3 + "blablabla".byterindex(/.{6}/, 3).should == 3 + + "blablabla".byterindex(/.l./, 0).should == 0 + "blablabla".byterindex(/.l./, 1).should == 0 + "blablabla".byterindex(/.l./, 2).should == 0 + "blablabla".byterindex(/.l./, 3).should == 3 + + "blablablax".byterindex(/.x/, 10).should == 8 + "blablablax".byterindex(/.x/, 9).should == 8 + "blablablax".byterindex(/.x/, 8).should == 8 + + "blablablax".byterindex(/..x/, 10).should == 7 + "blablablax".byterindex(/..x/, 9).should == 7 + "blablablax".byterindex(/..x/, 8).should == 7 + "blablablax".byterindex(/..x/, 7).should == 7 + + not_supported_on :opal do + "blablabla\n".byterindex(/\Z/, 9).should == 9 + end + end + + it "starts the search at offset + self.length if offset is negative" do + str = "blablabla" + + ["bl", "bla", "blab", "la", "lab", "ab", ""].each do |needle| + (-str.length .. -1).each do |offset| + str.byterindex(needle, offset).should == + str.byterindex(needle, offset + str.length) + end + end + end + + it "returns nil if the substring isn't found" do + "blablabla".byterindex(/BLA/).should == nil + "blablabla".byterindex(/.{10}/).should == nil + "blablablax".byterindex(/.x/, 7).should == nil + "blablablax".byterindex(/..x/, 6).should == nil + + not_supported_on :opal do + "blablabla".byterindex(/\Z/, 5).should == nil + "blablabla".byterindex(/\z/, 5).should == nil + "blablabla\n".byterindex(/\z/, 9).should == nil + end + end + + not_supported_on :opal do + it "supports \\G which matches at the given start offset" do + "helloYOU.".byterindex(/YOU\G/, 8).should == 5 + "helloYOU.".byterindex(/YOU\G/).should == nil + + idx = "helloYOUall!".index("YOU") + re = /YOU.+\G.+/ + # The # marks where \G will match. + [ + ["helloYOU#all.", nil], + ["helloYOUa#ll.", idx], + ["helloYOUal#l.", idx], + ["helloYOUall#.", idx], + ["helloYOUall.#", nil] + ].each do |i| + start = i[0].index("#") + str = i[0].delete("#") + + str.byterindex(re, start).should == i[1] + end + end + end + + it "tries to convert start_offset to an integer" do + obj = mock('5') + def obj.to_int() 5 end + "str".byterindex(/../, obj).should == 1 + + obj = mock('5') + def obj.respond_to?(arg, *) true end + def obj.method_missing(*args); 5; end + "str".byterindex(/../, obj).should == 1 + end + + it "raises a TypeError when given offset is nil" do + -> { "str".byterindex(/../, nil) }.should raise_error(TypeError) + end + + it "returns the reverse byte index of a multibyte character" do + "ありがりがとう".byterindex("が").should == 12 + "ありがりがとう".byterindex(/が/).should == 12 + end + + it "returns the character index before the finish" do + "ありがりがとう".byterindex("が", 9).should == 6 + "ありがりがとう".byterindex(/が/, 9).should == 6 + end + end +end diff --git a/spec/ruby/core/string/shared/byte_index_common.rb b/spec/ruby/core/string/shared/byte_index_common.rb new file mode 100644 index 000000000000..fa73e291ede5 --- /dev/null +++ b/spec/ruby/core/string/shared/byte_index_common.rb @@ -0,0 +1,63 @@ +# -*- encoding: utf-8 -*- +require_relative '../../../spec_helper' + +describe :byte_index_common, shared: true do + describe "raises on type errors" do + it "raises a TypeError if passed nil" do + -> { "abc".send(@method, nil) }.should raise_error(TypeError, "no implicit conversion of nil into String") + end + + it "raises a TypeError if passed a boolean" do + -> { "abc".send(@method, true) }.should raise_error(TypeError, "no implicit conversion of true into String") + end + + it "raises a TypeError if passed a Symbol" do + not_supported_on :opal do + -> { "abc".send(@method, :a) }.should raise_error(TypeError, "no implicit conversion of Symbol into String") + end + end + + it "raises a TypeError if passed a Symbol" do + obj = mock('x') + obj.should_not_receive(:to_int) + -> { "hello".send(@method, obj) }.should raise_error(TypeError, "no implicit conversion of MockObject into String") + end + + it "raises a TypeError if passed an Integer" do + -> { "abc".send(@method, 97) }.should raise_error(TypeError, "no implicit conversion of Integer into String") + end + end + + describe "with multibyte codepoints" do + it "raises an IndexError when byte offset lands in the middle of a multibyte character" do + -> { "わ".send(@method, "", 1) }.should raise_error(IndexError, "offset 1 does not land on character boundary") + -> { "わ".send(@method, "", 2) }.should raise_error(IndexError, "offset 2 does not land on character boundary") + -> { "わ".send(@method, "", -1) }.should raise_error(IndexError, "offset 2 does not land on character boundary") + -> { "わ".send(@method, "", -2) }.should raise_error(IndexError, "offset 1 does not land on character boundary") + end + + it "raises an Encoding::CompatibilityError if the encodings are incompatible" do + re = Regexp.new "れ".encode(Encoding::EUC_JP) + -> do + "あれ".send(@method, re) + end.should raise_error(Encoding::CompatibilityError, "incompatible character encodings: UTF-8 and EUC-JP") + end + end + + describe "with global variables" do + it "doesn't set $~ for non regex search" do + $~ = nil + + 'hello.'.send(@method, 'll') + $~.should == nil + end + + it "sets $~ to MatchData of match and nil when there's none" do + 'hello.'.send(@method, /.e./) + $~[0].should == 'hel' + + 'hello.'.send(@method, /not/) + $~.should == nil + end + end +end diff --git a/spec/truffle/methods/String.txt b/spec/truffle/methods/String.txt index 05427da07527..51a5ad3aff23 100644 --- a/spec/truffle/methods/String.txt +++ b/spec/truffle/methods/String.txt @@ -12,6 +12,8 @@ []= ascii_only? b +byteindex +byterindex bytes bytesize byteslice diff --git a/spec/truffleruby.next-specs b/spec/truffleruby.next-specs index 39d167983f08..051c2e10b79a 100644 --- a/spec/truffleruby.next-specs +++ b/spec/truffleruby.next-specs @@ -16,3 +16,6 @@ spec/ruby/core/hash/shift_spec.rb spec/ruby/core/range/size_spec.rb spec/ruby/core/string/dedup_spec.rb + +spec/ruby/core/string/byteindex_spec.rb +spec/ruby/core/string/byterindex_spec.rb diff --git a/src/main/java/org/truffleruby/core/string/StringNodes.java b/src/main/java/org/truffleruby/core/string/StringNodes.java index 2ff5d3ce3787..67a4015ad05b 100644 --- a/src/main/java/org/truffleruby/core/string/StringNodes.java +++ b/src/main/java/org/truffleruby/core/string/StringNodes.java @@ -3957,6 +3957,39 @@ protected Object stringByteIndex( } } + /** Search pattern in string starting at offset bytes backwards, and return a byte index or nil */ + @Primitive(name = "string_byte_reverse_index", lowerFixnum = 3) + public abstract static class StringByteReverseIndexNode extends PrimitiveArrayArgumentsNode { + @Specialization + protected Object stringByteIndex( + Object rubyString, Object rubyPattern, RubyEncoding compatibleEncoding, int byteOffset, + @Cached RubyStringLibrary libString, + @Cached RubyStringLibrary libPattern, + @Cached TruffleString.LastByteIndexOfStringNode lastByteIndexOfStringNode, + @Cached ConditionProfile indexOutOfBoundsProfile, + @Cached ConditionProfile foundProfile) { + assert byteOffset >= 0; + + var string = libString.getTString(rubyString); + int stringByteLength = libString.byteLength(rubyString); + + var pattern = libPattern.getTString(rubyPattern); + int patternByteLength = libPattern.byteLength(rubyPattern); + + if (indexOutOfBoundsProfile.profile(patternByteLength > stringByteLength)) { + return nil; + } + + int found = lastByteIndexOfStringNode.execute(string, pattern, byteOffset, 0, + compatibleEncoding.tencoding); + if (foundProfile.profile(found >= 0)) { + return found; + } + + return nil; + } + } + // Port of Rubinius's String::previous_byte_index. // // This method takes a byte index, finds the corresponding character the byte index belongs to, and then returns diff --git a/src/main/ruby/truffleruby/core/string.rb b/src/main/ruby/truffleruby/core/string.rb index 6ab80670b0a8..a0ae08f64553 100644 --- a/src/main/ruby/truffleruby/core/string.rb +++ b/src/main/ruby/truffleruby/core/string.rb @@ -1097,6 +1097,82 @@ def rindex(sub, finish = undefined) nil end + def byteindex(str, start = undefined) + is_regex_pattern = Primitive.is_a?(str, Regexp) + + if Primitive.undefined?(start) + start = 0 + else + start = Primitive.rb_to_int(start) + + start += bytesize if start < 0 + if start < 0 || start > bytesize + if is_regex_pattern + Primitive.regexp_last_match_set(Primitive.caller_special_variables, nil) + end + + return nil + end + end + + unless Truffle::StringOperations.on_codepoint_boundary?(self, start) + raise IndexError, "offset #{start} does not land on character boundary" + end + + if is_regex_pattern + Primitive.encoding_ensure_compatible(self, str) + + match = Truffle::RegexpOperations.match_from(str, self, start) + Primitive.regexp_last_match_set(Primitive.caller_special_variables, match) + return match ? Primitive.character_index_to_byte_index(self, match.begin(0)) : nil + end + + str = StringValue(str) + return start if str.empty? + return nil if start + str.bytesize > bytesize + + enc = Primitive.encoding_ensure_compatible_str(self, str) + Primitive.string_byte_index(self, str, enc, start) + end + + def byterindex(str, finish = undefined) + if Primitive.undefined?(finish) + finish = bytesize + else + finish = Primitive.rb_to_int(finish) + finish += bytesize if finish < 0 + return nil if finish < 0 + + finish = bytesize if finish > bytesize + end + + unless Truffle::StringOperations.on_codepoint_boundary?(self, finish) + raise IndexError, "offset #{finish} does not land on character boundary" + end + + if Primitive.is_a?(str, Regexp) + Primitive.encoding_ensure_compatible(self, str) + + match = Truffle::RegexpOperations.search_region(str, self, 0, finish, false, true) + Primitive.regexp_last_match_set(Primitive.caller_special_variables, match) + return match ? Primitive.character_index_to_byte_index(self, match.begin(0)) : nil + end + + str = StringValue(str) + return finish if str.empty? + return nil if str.bytesize > bytesize + + # Add `str.size` worth of bytes to `finish` to compensate for `LastByteIndexOfStringNode` which does a reverse + # search including the full pattern length (as start pos). + finish_adjusted = Primitive.byte_index_to_character_index(self, finish) + finish_adjusted += str.size + finish_adjusted = size if finish_adjusted > size + finish = Primitive.character_index_to_byte_index(self, finish_adjusted) + + enc = Primitive.encoding_ensure_compatible_str(self, str) + Primitive.string_byte_reverse_index(self, str, enc, finish) + end + def start_with?(*prefixes) if prefixes.size == 1 and prefix = prefixes[0] and Primitive.is_a?(prefix, String) enc = Primitive.encoding_ensure_compatible_str self, prefix diff --git a/src/main/ruby/truffleruby/core/truffle/polyglot_methods.rb b/src/main/ruby/truffleruby/core/truffle/polyglot_methods.rb index cb988edd8006..92dda0d54e7a 100644 --- a/src/main/ruby/truffleruby/core/truffle/polyglot_methods.rb +++ b/src/main/ruby/truffleruby/core/truffle/polyglot_methods.rb @@ -68,6 +68,14 @@ def b(...) to_s.b(...) end + def byteindex(...) + to_s.byteindex(...) + end + + def byterindex(...) + to_s.byterindex(...) + end + def bytes(...) to_s.bytes(...) end diff --git a/src/main/ruby/truffleruby/core/truffle/string_operations.rb b/src/main/ruby/truffleruby/core/truffle/string_operations.rb index 8e6cee160eb6..ea355f641465 100644 --- a/src/main/ruby/truffleruby/core/truffle/string_operations.rb +++ b/src/main/ruby/truffleruby/core/truffle/string_operations.rb @@ -411,5 +411,16 @@ def self.assign_regexp(string, index, count, replacement) Primitive.string_splice(string, replacement, bi, bs, enc) end + + def self.on_codepoint_boundary?(string, byte_pos) + char_pos = Primitive.byte_index_to_character_index(string, byte_pos) + adjusted_byte_pos = if char_pos >= string.size + string.bytesize + else + Primitive.character_index_to_byte_index(string, char_pos) + end + + byte_pos == adjusted_byte_pos + end end end