Skip to content

Commit

Permalink
src: remove icu based ToASCII and ToUnicode
Browse files Browse the repository at this point in the history
PR-URL: #55156
Reviewed-By: James M Snell <[email protected]>
Reviewed-By: Matthew Aitken <[email protected]>
Reviewed-By: Daniel Lemire <[email protected]>
Reviewed-By: Richard Lau <[email protected]>
  • Loading branch information
anonrig authored Oct 18, 2024
1 parent 7c0cc12 commit 9f5000e
Show file tree
Hide file tree
Showing 4 changed files with 2 additions and 389 deletions.
172 changes: 2 additions & 170 deletions src/node_i18n.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,19 +60,17 @@
#include <unicode/uchar.h>
#include <unicode/uclean.h>
#include <unicode/ucnv.h>
#include <unicode/udata.h>
#include <unicode/uidna.h>
#include <unicode/ulocdata.h>
#include <unicode/urename.h>
#include <unicode/ustring.h>
#include <unicode/utf16.h>
#include <unicode/utf8.h>
#include <unicode/utypes.h>
#include <unicode/uvernum.h>
#include <unicode/uversion.h>
#include "nbytes.h"

#ifdef NODE_HAVE_SMALL_ICU
#include <unicode/udata.h>

/* if this is defined, we have a 'secondary' entry point.
compare following to utypes.h defs for U_ICUDATA_ENTRY_POINT */
#define SMALL_ICUDATA_ENTRY_POINT \
Expand All @@ -96,7 +94,6 @@ using v8::Int32;
using v8::Isolate;
using v8::Local;
using v8::MaybeLocal;
using v8::NewStringType;
using v8::Object;
using v8::ObjectTemplate;
using v8::String;
Expand Down Expand Up @@ -583,167 +580,6 @@ void SetDefaultTimeZone(const char* tzid) {
CHECK(U_SUCCESS(status));
}

// Converts a domain name to its Unicode form through ICU's UTS #46
// implementation, opened with UIDNA_NONTRANSITIONAL_TO_UNICODE.
//
// The UTF-8 result is written into `buf`. If ICU reports a buffer overflow,
// `buf` is grown to the length ICU returned and the conversion is retried
// once.
//
// Returns the length of the converted name, or -1 when ICU reports a failure
// (in which case `buf` is reset to length 0).
int32_t ToUnicode(MaybeStackBuffer<char>* buf,
                  const char* input,
                  size_t length) {
  UErrorCode err = U_ZERO_ERROR;
  UIDNA* idna = uidna_openUTS46(UIDNA_NONTRANSITIONAL_TO_UNICODE, &err);
  if (U_FAILURE(err)) return -1;

  UIDNAInfo info = UIDNA_INFO_INITIALIZER;

  // One conversion attempt against whatever storage `buf` currently holds.
  const auto convert = [&]() -> int32_t {
    return uidna_nameToUnicodeUTF8(
        idna, input, length, **buf, buf->capacity(), &info, &err);
  };

  int32_t written = convert();
  if (err == U_BUFFER_OVERFLOW_ERROR) {
    // Grow the buffer to the length ICU returned and try again.
    err = U_ZERO_ERROR;
    buf->AllocateSufficientStorage(written);
    written = convert();
  }

  // info.errors is deliberately not inspected here (unlike ToASCII): UTS #46
  // ToUnicode always produces a Unicode string, regardless of whether an
  // error occurred.
  const bool failed = U_FAILURE(err);
  if (failed) written = -1;
  buf->SetLength(failed ? 0 : written);

  uidna_close(idna);
  return written;
}

// Converts a domain name to its ASCII form through ICU's UTS #46
// implementation.
//
// The result is written into `buf`. If ICU reports a buffer overflow, `buf`
// is grown to the length ICU returned and the conversion is retried once.
//
// `mode` controls strictness:
//   - idna_mode::kStrict additionally enables UIDNA_USE_STD3_RULES and keeps
//     the DNS length errors (empty label, label too long, domain too long).
//   - idna_mode::kLenient ignores whatever is left in info.errors.
//   - any other mode drops the DNS length errors but fails on remaining ones.
//
// Returns the length of the converted name, or -1 on failure (in which case
// `buf` is reset to length 0).
int32_t ToASCII(MaybeStackBuffer<char>* buf,
                const char* input,
                size_t length,
                idna_mode mode) {
  const bool strict = (mode == idna_mode::kStrict);

  // CheckHyphens = false and VerifyDnsLength = beStrict cannot be expressed
  // through ICU options; both are handled below by filtering info.errors.
  uint32_t options = UIDNA_CHECK_BIDI |                // CheckBidi = true
                     UIDNA_CHECK_CONTEXTJ |            // CheckJoiners = true
                     UIDNA_NONTRANSITIONAL_TO_ASCII;   // Nontransitional
  if (strict) {
    options |= UIDNA_USE_STD3_RULES;  // UseSTD3ASCIIRules = beStrict
  }

  UErrorCode err = U_ZERO_ERROR;
  UIDNA* idna = uidna_openUTS46(options, &err);
  if (U_FAILURE(err)) return -1;

  UIDNAInfo info = UIDNA_INFO_INITIALIZER;

  // One conversion attempt against whatever storage `buf` currently holds.
  const auto convert = [&]() -> int32_t {
    return uidna_nameToASCII_UTF8(
        idna, input, length, **buf, buf->capacity(), &info, &err);
  };

  int32_t written = convert();
  if (err == U_BUFFER_OVERFLOW_ERROR) {
    // Grow the buffer to the length ICU returned and try again.
    err = U_ZERO_ERROR;
    buf->AllocateSufficientStorage(written);
    written = convert();
  }

  // UTS #46 makes certain ToASCII error conditions configurable, and the
  // WHATWG URL Standard disables some of them to accommodate real-world use
  // cases. ICU4C's IDNA module cannot disable them via `options`, so the
  // corresponding error bits are cleared here before deciding whether the
  // conversion failed.
  //
  // CheckHyphens = false
  // (Specified in the current UTS #46 draft rev. 18.)
  // Refs:
  // - https://github.com/whatwg/url/issues/53
  // - https://github.com/whatwg/url/pull/309
  // - http://www.unicode.org/review/pri317/
  // - http://www.unicode.org/reports/tr46/tr46-18.html
  // - https://www.icann.org/news/announcement-2000-01-07-en
  uint32_t ignored_errors = UIDNA_ERROR_HYPHEN_3_4 |
                            UIDNA_ERROR_LEADING_HYPHEN |
                            UIDNA_ERROR_TRAILING_HYPHEN;
  if (!strict) {
    // VerifyDnsLength = beStrict
    ignored_errors |= UIDNA_ERROR_EMPTY_LABEL |
                      UIDNA_ERROR_LABEL_TOO_LONG |
                      UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
  }
  info.errors &= ~ignored_errors;

  const bool has_fatal_idna_error =
      mode != idna_mode::kLenient && info.errors != 0;
  const bool failed = U_FAILURE(err) || has_fatal_idna_error;
  if (failed) written = -1;
  buf->SetLength(failed ? 0 : written);

  uidna_close(idna);
  return written;
}

static void ToUnicode(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_GE(args.Length(), 1);
CHECK(args[0]->IsString());
Utf8Value val(env->isolate(), args[0]);

MaybeStackBuffer<char> buf;
int32_t len = ToUnicode(&buf, *val, val.length());

if (len < 0) {
return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to Unicode");
}

args.GetReturnValue().Set(
String::NewFromUtf8(env->isolate(),
*buf,
NewStringType::kNormal,
len).ToLocalChecked());
}

static void ToASCII(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_GE(args.Length(), 1);
CHECK(args[0]->IsString());
Utf8Value val(env->isolate(), args[0]);
// optional arg
bool lenient = args[1]->BooleanValue(env->isolate());
idna_mode mode = lenient ? idna_mode::kLenient : idna_mode::kDefault;

MaybeStackBuffer<char> buf;
int32_t len = ToASCII(&buf, *val, val.length(), mode);

if (len < 0) {
return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to ASCII");
}

args.GetReturnValue().Set(
String::NewFromUtf8(env->isolate(),
*buf,
NewStringType::kNormal,
len).ToLocalChecked());
}

// This is similar to wcwidth except that it takes the current unicode
// character properties database into consideration, allowing it to
// correctly calculate the column widths of things like emoji's and
Expand Down Expand Up @@ -850,8 +686,6 @@ static void CreatePerIsolateProperties(IsolateData* isolate_data,
Local<ObjectTemplate> target) {
Isolate* isolate = isolate_data->isolate();

SetMethod(isolate, target, "toUnicode", ToUnicode);
SetMethod(isolate, target, "toASCII", ToASCII);
SetMethod(isolate, target, "getStringWidth", GetStringWidth);

// One-shot converters
Expand Down Expand Up @@ -880,8 +714,6 @@ void CreatePerContextProperties(Local<Object> target,
void* priv) {}

void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
registry->Register(ToUnicode);
registry->Register(ToASCII);
registry->Register(GetStringWidth);
registry->Register(ICUErrorName);
registry->Register(Transcode);
Expand Down
13 changes: 0 additions & 13 deletions src/node_i18n.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,19 +53,6 @@ enum class idna_mode {
kStrict
};

// Implements the WHATWG URL Standard "domain to ASCII" algorithm.
// https://url.spec.whatwg.org/#concept-domain-to-ascii
//
// Converts the `length` bytes at `input` and stores the result in `buf`.
// `mode` selects how strictly conversion errors are treated and defaults to
// idna_mode::kDefault.
// Returns the length of the converted name, or -1 on failure.
int32_t ToASCII(MaybeStackBuffer<char>* buf,
const char* input,
size_t length,
idna_mode mode = idna_mode::kDefault);

// Implements the WHATWG URL Standard "domain to Unicode" algorithm.
// https://url.spec.whatwg.org/#concept-domain-to-unicode
//
// Converts the `length` bytes at `input` and stores the result in `buf`.
// Returns the length of the converted name, or -1 on failure.
int32_t ToUnicode(MaybeStackBuffer<char>* buf,
const char* input,
size_t length);

// Deleter functor that closes a UConverter by calling ucnv_close() on it.
struct ConverterDeleter {
  void operator()(UConverter* converter) const {
    ucnv_close(converter);
  }
};
Expand Down
149 changes: 0 additions & 149 deletions test/fixtures/icu-punycode-toascii.json

This file was deleted.

Loading

0 comments on commit 9f5000e

Please sign in to comment.