-
-
Notifications
You must be signed in to change notification settings - Fork 21.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimize String::copy_from
and String::copy_from_unchecked
implementations, improving String allocation speed.
#99816
Optimize String::copy_from
and String::copy_from_unchecked
implementations, improving String allocation speed.
#99816
Conversation
e1d663c
to
4c2505f
Compare
Depending on the compiler, this change gains an estimated 15% speed for medium length Strings, and ~5% for large length Strings like from Synthetic Benchmark: https://onlinegdb.com/eEJ7-vImD |
87bf745
to
2d46321
Compare
Edit: I previously had another benchmark here, which was either outdated or invalid. Either way, I cannot recreate it. Instead, here's a newer one that's definitely reproducible: static const char _vertex_code[]={
R"<!>(
#ifdef USE_ATTRIBUTES
layout(location = 0) in vec2 vertex_attrib;
layout(location = 3) in vec4 color_attrib;
layout(location = 4) in vec2 uv_attrib;
#ifdef USE_INSTANCING
layout(location = 1) in highp vec4 instance_xform0;
layout(location = 2) in highp vec4 instance_xform1;
layout(location = 5) in highp uvec4 instance_color_custom_data; // Color packed into xy, custom_data packed into zw for compatibility with 3D
#endif // USE_INSTANCING
#endif // USE_ATTRIBUTES
// Compatibility renames. These are exposed with the "godot_" prefix
// to work around two distinct Adreno bugs:
// 1. Some Adreno devices expose ES310 functions in ES300 shaders.
// Internally, we must use the "godot_" prefix, but user shaders
// will be mapped automatically.
// 2. Adreno 3XX devices have poor implementations of the other packing
// functions, so we just use our own there to keep it simple.
#ifdef USE_HALF2FLOAT
// Floating point pack/unpack functions are part of the GLSL ES 300 specification used by web and mobile.
// It appears to be safe to expose these on mobile, but when running through ANGLE this appears to break.
uint float2half(uint f) {
uint e = f & uint(0x7f800000);
if (e <= uint(0x38000000)) {
return uint(0);
} else {
return ((f >> uint(16)) & uint(0x8000)) |
(((e - uint(0x38000000)) >> uint(13)) & uint(0x7c00)) |
((f >> uint(13)) & uint(0x03ff));
}
}
uint half2float(uint h) {
uint h_e = h & uint(0x7c00);
return ((h & uint(0x8000)) << uint(16)) | uint((h_e >> uint(10)) != uint(0)) * (((h_e + uint(0x1c000)) << uint(13)) | ((h & uint(0x03ff)) << uint(13)));
}
uint godot_packHalf2x16(vec2 v) {
return float2half(floatBitsToUint(v.x)) | float2half(floatBitsToUint(v.y)) << uint(16);
}
vec2 godot_unpackHalf2x16(uint v) {
return vec2(uintBitsToFloat(half2float(v & uint(0xffff))),
uintBitsToFloat(half2float(v >> uint(16))));
}
uint godot_packUnorm2x16(vec2 v) {
uvec2 uv = uvec2(round(clamp(v, vec2(0.0), vec2(1.0)) * 65535.0));
return uv.x | uv.y << uint(16);
}
vec2 godot_unpackUnorm2x16(uint p) {
return vec2(float(p & uint(0xffff)), float(p >> uint(16))) * 0.000015259021; // 1.0 / 65535.0 optimization
}
uint godot_packSnorm2x16(vec2 v) {
uvec2 uv = uvec2(round(clamp(v, vec2(-1.0), vec2(1.0)) * 32767.0) + 32767.0);
return uv.x | uv.y << uint(16);
}
vec2 godot_unpackSnorm2x16(uint p) {
vec2 v = vec2(float(p & uint(0xffff)), float(p >> uint(16)));
return clamp((v - 32767.0) * vec2(0.00003051851), vec2(-1.0), vec2(1.0));
}
#define packHalf2x16 godot_packHalf2x16
#define unpackHalf2x16 godot_unpackHalf2x16
#define packUnorm2x16 godot_packUnorm2x16
#define unpackUnorm2x16 godot_unpackUnorm2x16
#define packSnorm2x16 godot_packSnorm2x16
#define unpackSnorm2x16 godot_unpackSnorm2x16
#endif // USE_HALF2FLOAT
// Always expose these as they are ES310 functions and not available in ES300 or GLSL 330.
uint godot_packUnorm4x8(vec4 v) {
uvec4 uv = uvec4(round(clamp(v, vec4(0.0), vec4(1.0)) * 255.0));
return uv.x | (uv.y << uint(8)) | (uv.z << uint(16)) | (uv.w << uint(24));
}
vec4 godot_unpackUnorm4x8(uint p) {
return vec4(float(p & uint(0xff)), float((p >> uint(8)) & uint(0xff)), float((p >> uint(16)) & uint(0xff)), float(p >> uint(24))) * 0.00392156862; // 1.0 / 255.0
}
uint godot_packSnorm4x8(vec4 v) {
uvec4 uv = uvec4(round(clamp(v, vec4(-1.0), vec4(1.0)) * 127.0) + 127.0);
return uv.x | uv.y << uint(8) | uv.z << uint(16) | uv.w << uint(24);
}
vec4 godot_unpackSnorm4x8(uint p) {
vec4 v = vec4(float(p & uint(0xff)), float((p >> uint(8)) & uint(0xff)), float((p >> uint(16)) & uint(0xff)), float(p >> uint(24)));
return clamp((v - vec4(127.0)) * vec4(0.00787401574), vec4(-1.0), vec4(1.0));
}
#define packUnorm4x8 godot_packUnorm4x8
#define unpackUnorm4x8 godot_unpackUnorm4x8
#define packSnorm4x8 godot_packSnorm4x8
#define unpackSnorm4x8 godot_unpackSnorm4x8
#if defined(CUSTOM0_USED)
layout(location = 6) in highp vec4 custom0_attrib;
#endif
#if defined(CUSTOM1_USED)
layout(location = 7) in highp vec4 custom1_attrib;
#endif
layout(location = 8) in highp vec4 attrib_A;
layout(location = 9) in highp vec4 attrib_B;
layout(location = 10) in highp vec4 attrib_C;
layout(location = 11) in highp vec4 attrib_D;
layout(location = 12) in highp vec4 attrib_E;
#ifdef USE_PRIMITIVE
layout(location = 13) in highp uvec4 attrib_F;
#else
layout(location = 13) in highp vec4 attrib_F;
#endif
layout(location = 14) in highp uvec4 attrib_G;
layout(location = 15) in highp uvec4 attrib_H;
#define read_draw_data_world_x attrib_A.xy
#define read_draw_data_world_y attrib_A.zw
#define read_draw_data_world_ofs attrib_B.xy
#define read_draw_data_color_texture_pixel_size attrib_B.zw
#ifdef USE_PRIMITIVE
#define read_draw_data_point_a attrib_C.xy
#define read_draw_data_point_b attrib_C.zw
#define read_draw_data_point_c attrib_D.xy
#define read_draw_data_uv_a attrib_D.zw
#define read_draw_data_uv_b attrib_E.xy
#define read_draw_data_uv_c attrib_E.zw
#define read_draw_data_color_a_rg attrib_F.x
#define read_draw_data_color_a_ba attrib_F.y
#define read_draw_data_color_b_rg attrib_F.z
#define read_draw_data_color_b_ba attrib_F.w
#define read_draw_data_color_c_rg attrib_G.x
#define read_draw_data_color_c_ba attrib_G.y
#else
#define read_draw_data_modulation attrib_C
#define read_draw_data_ninepatch_margins attrib_D
#define read_draw_data_dst_rect attrib_E
#define read_draw_data_src_rect attrib_F
#endif
#define read_draw_data_flags attrib_G.z
#define read_draw_data_specular_shininess attrib_G.w
#define read_draw_data_lights attrib_H
// Varyings so the per-instance info can be used in the fragment shader
flat out vec4 varying_A;
flat out vec2 varying_B;
#ifndef USE_PRIMITIVE
flat out vec4 varying_C;
#ifndef USE_ATTRIBUTES
#ifdef USE_NINEPATCH
flat out vec2 varying_D;
#endif
flat out vec4 varying_E;
#endif
#endif
flat out uvec2 varying_F;
flat out uvec4 varying_G;
// This needs to be outside clang-format so the ubo comment is in the right place
#ifdef MATERIAL_UNIFORMS_USED
layout(std140) uniform MaterialUniforms{ //ubo:4
#MATERIAL_UNIFORMS
};
#endif
uniform mediump uint batch_flags;
/* clang-format on */
#define MAX_LIGHTS_PER_ITEM uint(16)
#define M_PI 3.14159265359
#define SDF_MAX_LENGTH 16384.0
#define INSTANCE_FLAGS_LIGHT_COUNT_SHIFT 0 // 4 bits.
#define INSTANCE_FLAGS_CLIP_RECT_UV uint(1 << 4)
#define INSTANCE_FLAGS_TRANSPOSE_RECT uint(1 << 5)
#define INSTANCE_FLAGS_USE_MSDF uint(1 << 6)
#define INSTANCE_FLAGS_USE_LCD uint(1 << 7)
#define INSTANCE_FLAGS_NINEPATCH_DRAW_CENTER uint(1 << 8)
#define INSTANCE_FLAGS_NINEPATCH_H_MODE_SHIFT 9
#define INSTANCE_FLAGS_NINEPATCH_V_MODE_SHIFT 11
#define INSTANCE_FLAGS_SHADOW_MASKED_SHIFT 13u // 16 bits.
#define INSTANCE_FLAGS_SHADOW_MASKED uint(1 << INSTANCE_FLAGS_SHADOW_MASKED_SHIFT)
// 1 means enabled, 2+ means trails in use
#define BATCH_FLAGS_INSTANCING_MASK uint(0x7F)
#define BATCH_FLAGS_INSTANCING_HAS_COLORS_SHIFT 7
#define BATCH_FLAGS_INSTANCING_HAS_COLORS uint(1 << BATCH_FLAGS_INSTANCING_HAS_COLORS_SHIFT)
#define BATCH_FLAGS_INSTANCING_HAS_CUSTOM_DATA_SHIFT 8
#define BATCH_FLAGS_INSTANCING_HAS_CUSTOM_DATA uint(1 << BATCH_FLAGS_INSTANCING_HAS_CUSTOM_DATA_SHIFT)
#define BATCH_FLAGS_DEFAULT_NORMAL_MAP_USED uint(1 << 9)
#define BATCH_FLAGS_DEFAULT_SPECULAR_MAP_USED uint(1 << 10)
layout(std140) uniform GlobalShaderUniformData { //ubo:1
vec4 global_shader_uniforms[MAX_GLOBAL_SHADER_UNIFORMS];
};
layout(std140) uniform CanvasData { //ubo:0
mat4 canvas_transform;
mat4 screen_transform;
mat4 canvas_normal_transform;
vec4 canvas_modulation;
vec2 screen_pixel_size;
float time;
bool use_pixel_snap;
vec4 sdf_to_tex;
vec2 screen_to_sdf;
vec2 sdf_to_screen;
uint directional_light_count;
float tex_to_sdf;
uint pad1;
uint pad2;
};
#ifndef DISABLE_LIGHTING
#define LIGHT_FLAGS_BLEND_MASK uint(3 << 16)
#define LIGHT_FLAGS_BLEND_MODE_ADD uint(0 << 16)
#define LIGHT_FLAGS_BLEND_MODE_SUB uint(1 << 16)
#define LIGHT_FLAGS_BLEND_MODE_MIX uint(2 << 16)
#define LIGHT_FLAGS_BLEND_MODE_MASK uint(3 << 16)
#define LIGHT_FLAGS_HAS_SHADOW uint(1 << 20)
#define LIGHT_FLAGS_FILTER_SHIFT 22
#define LIGHT_FLAGS_FILTER_MASK uint(3 << 22)
#define LIGHT_FLAGS_SHADOW_NEAREST uint(0 << 22)
#define LIGHT_FLAGS_SHADOW_PCF5 uint(1 << 22)
#define LIGHT_FLAGS_SHADOW_PCF13 uint(2 << 22)
struct Light {
mat2x4 texture_matrix; //light to texture coordinate matrix (transposed)
mat2x4 shadow_matrix; //light to shadow coordinate matrix (transposed)
vec4 color;
uint shadow_color; // packed
uint flags; //index to light texture
float shadow_pixel_size;
float height;
vec2 position;
float shadow_zfar_inv;
float shadow_y_ofs;
vec4 atlas_rect;
};
layout(std140) uniform LightData { //ubo:2
Light light_array[MAX_LIGHTS];
};
#endif // DISABLE_LIGHTING
out vec2 uv_interp;
out vec4 color_interp;
out vec2 vertex_interp;
#ifdef USE_NINEPATCH
out vec2 pixel_size_interp;
#endif
#GLOBALS
void main() {
varying_A = vec4(read_draw_data_world_x, read_draw_data_world_y);
varying_B = read_draw_data_color_texture_pixel_size;
#ifndef USE_PRIMITIVE
varying_C = read_draw_data_ninepatch_margins;
#ifndef USE_ATTRIBUTES
#ifdef USE_NINEPATCH
varying_D = vec2(read_draw_data_dst_rect.z, read_draw_data_dst_rect.w);
#endif // USE_NINEPATCH
varying_E = read_draw_data_src_rect;
#endif // !USE_ATTRIBUTES
#endif // USE_PRIMITIVE
varying_F = uvec2(read_draw_data_flags, read_draw_data_specular_shininess);
varying_G = read_draw_data_lights;
vec4 instance_custom = vec4(0.0);
#if defined(CUSTOM0_USED)
vec4 custom0 = vec4(0.0);
#endif
#if defined(CUSTOM1_USED)
vec4 custom1 = vec4(0.0);
#endif
#ifdef USE_PRIMITIVE
vec2 vertex;
vec2 uv;
vec4 color;
if (gl_VertexID % 3 == 0) {
vertex = read_draw_data_point_a;
uv = read_draw_data_uv_a;
color.xy = unpackHalf2x16(read_draw_data_color_a_rg);
color.zw = unpackHalf2x16(read_draw_data_color_a_ba);
} else if (gl_VertexID % 3 == 1) {
vertex = read_draw_data_point_b;
uv = read_draw_data_uv_b;
color.xy = unpackHalf2x16(read_draw_data_color_b_rg);
color.zw = unpackHalf2x16(read_draw_data_color_b_ba);
} else {
vertex = read_draw_data_point_c;
uv = read_draw_data_uv_c;
color.xy = unpackHalf2x16(read_draw_data_color_c_rg);
color.zw = unpackHalf2x16(read_draw_data_color_c_ba);
}
#elif defined(USE_ATTRIBUTES)
vec2 vertex = vertex_attrib;
vec4 color = color_attrib * read_draw_data_modulation;
vec2 uv = uv_attrib;
#ifdef USE_INSTANCING
if (bool(batch_flags & BATCH_FLAGS_INSTANCING_HAS_COLORS)) {
vec4 instance_color;
instance_color.xy = unpackHalf2x16(uint(instance_color_custom_data.x));
instance_color.zw = unpackHalf2x16(uint(instance_color_custom_data.y));
color *= instance_color;
}
if (bool(batch_flags & BATCH_FLAGS_INSTANCING_HAS_CUSTOM_DATA)) {
instance_custom.xy = unpackHalf2x16(instance_color_custom_data.z);
instance_custom.zw = unpackHalf2x16(instance_color_custom_data.w);
}
#endif // !USE_INSTANCING
#else // !USE_ATTRIBUTES
// crash on Adreno 320/330
//vec2 vertex_base_arr[6] = vec2[](vec2(0.0, 0.0), vec2(0.0, 1.0), vec2(1.0, 1.0), vec2(1.0, 0.0), vec2(0.0, 0.0), vec2(1.0, 1.0));
//vec2 vertex_base = vertex_base_arr[gl_VertexID % 6];
//-----------------------------------------
// ID | 0 | 1 | 2 | 3 | 4 | 5 |
//-----------------------------------------
// X | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 |
// Y | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 |
//-----------------------------------------
// no crash or freeze on all Adreno 3xx with 'if / else if' and slightly faster!
int vertex_id = gl_VertexID % 6;
vec2 vertex_base;
if (vertex_id == 0)
vertex_base = vec2(0.0, 0.0);
else if (vertex_id == 1)
vertex_base = vec2(0.0, 1.0);
else if (vertex_id == 2)
vertex_base = vec2(1.0, 1.0);
else if (vertex_id == 3)
vertex_base = vec2(1.0, 0.0);
else if (vertex_id == 4)
vertex_base = vec2(0.0, 0.0);
else if (vertex_id == 5)
vertex_base = vec2(1.0, 1.0);
vec2 uv = read_draw_data_src_rect.xy + abs(read_draw_data_src_rect.zw) * ((read_draw_data_flags & INSTANCE_FLAGS_TRANSPOSE_RECT) != uint(0) ? vertex_base.yx : vertex_base.xy);
vec4 color = read_draw_data_modulation;
vec2 vertex = read_draw_data_dst_rect.xy + abs(read_draw_data_dst_rect.zw) * mix(vertex_base, vec2(1.0, 1.0) - vertex_base, lessThan(read_draw_data_src_rect.zw, vec2(0.0, 0.0)));
#endif // USE_ATTRIBUTES
#if defined(CUSTOM0_USED)
custom0 = custom0_attrib;
#endif
#if defined(CUSTOM1_USED)
custom1 = custom1_attrib;
#endif
mat4 model_matrix = mat4(vec4(read_draw_data_world_x, 0.0, 0.0), vec4(read_draw_data_world_y, 0.0, 0.0), vec4(0.0, 0.0, 1.0, 0.0), vec4(read_draw_data_world_ofs, 0.0, 1.0));
#ifdef USE_INSTANCING
model_matrix = model_matrix * transpose(mat4(instance_xform0, instance_xform1, vec4(0.0, 0.0, 1.0, 0.0), vec4(0.0, 0.0, 0.0, 1.0)));
#endif // USE_INSTANCING
vec2 color_texture_pixel_size = read_draw_data_color_texture_pixel_size;
#ifdef USE_POINT_SIZE
float point_size = 1.0;
#endif
#ifdef USE_WORLD_VERTEX_COORDS
vertex = (model_matrix * vec4(vertex, 0.0, 1.0)).xy;
#endif
{
#CODE : VERTEX
}
#ifdef USE_NINEPATCH
pixel_size_interp = abs(read_draw_data_dst_rect.zw) * vertex_base;
#endif
#if !defined(SKIP_TRANSFORM_USED) && !defined(USE_WORLD_VERTEX_COORDS)
vertex = (model_matrix * vec4(vertex, 0.0, 1.0)).xy;
#endif
color_interp = color;
vertex = (canvas_transform * vec4(vertex, 0.0, 1.0)).xy;
if (use_pixel_snap) {
vertex = floor(vertex + 0.5);
// precision issue on some hardware creates artifacts within texture
// offset uv by a small amount to avoid
uv += 1e-5;
}
vertex_interp = vertex;
uv_interp = uv;
gl_Position = screen_transform * vec4(vertex, 0.0, 1.0);
#ifdef USE_POINT_SIZE
gl_PointSize = point_size;
#endif
}
)<!>"
};
int test_main(int argc, char *argv[]) {
auto t0 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 100; ++i) {
String s = _vertex_code;
}
auto t1 = std::chrono::high_resolution_clock::now();
std::cout << std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count() << "us\n";
} So, allocation through |
String
copy_from
and^ copy_from_unchecked` implementations
String
copy_from
and^ copy_from_unchecked` implementationsString::copy_from
and String::copy_from_unchecked
implementations
…making use of caller contracts and language spec (NULL termination and casts).
String::copy_from
and String::copy_from_unchecked
implementationsString::copy_from
and String::copy_from_unchecked
implementations, improving String allocation speed.
2d46321
to
e1c4239
Compare
Isn't it a little odd that the unchecked version does some unicode checks ? As a API user, I would assume the same behaviour from a copy_from if I pass an additional length. The one taking no length should probably just call the length one after a strlen, that would enforce consistency. |
I agree it is. However, that's outside the scope of this PR. Hoping to address that in a future PR since it's a behavorial change, while this one is a pure performance upgrades without any behavior changes.
For now the "length one" is actually slower than the one without length in most situations. This will change with the merge of #100132, but we're not there yet. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Thanks! |
Summarizing my notes from #99775 (comment):
copy_from
,strlen()
has already established that noNULL
is contained in the string range. We can make use of this assumption by simply deleting the check for it.copy_from_unchecked
, where this assumption has already been documented above the function. It is a module-local function that's otherwise only used on substrings, where the contract of no-null is equally true (if dubious).char
touint8_t
will always reinterpret the bits in the same way that the manual check did in the current code (example). Clang is able to optimize this manual conversion away, but MSVC cannot. It is therefor optimizable.Everything put together, the
copy_from
loop is very likely to be SMID vectorized by the compiler now, improving its throughput by up to an estimated 8x.