diff --git a/src/client/analytics.spec.ts b/src/client/analytics.spec.ts index b4c191f..6d5fea8 100644 --- a/src/client/analytics.spec.ts +++ b/src/client/analytics.spec.ts @@ -15,7 +15,6 @@ import {BrowserRuntime, NoopRuntime} from './runtimeEnvironment'; import * as doNotTrack from '../donottrack'; import {Cookie} from '../cookieutils'; import {CoveoLinkParam} from '../plugins/link'; -import {NoopAnalytics} from './noopAnalytics'; const aVisitorId = '123'; jest.mock('uuid', () => ({ @@ -281,7 +280,9 @@ describe('Analytics', () => { describe('should truncate the maxlength for URL parameters at 128 characters for ua events', () => { const desiredMax: number = 128; - const longUrl: string = 'http://coveo.com/?q=' + 'a'.repeat(desiredMax); + // Craft the URL so the truncation point falls in the %20 sequence + const longUrl: string = 'http://coveo.com/?q=' + 'a'.repeat(desiredMax - 22) + '%20b'; + const expectedTruncatedLength = longUrl.lastIndexOf('%', desiredMax); expect(longUrl.length).toBeGreaterThan(desiredMax); async function testEventType(type: EventType, url: string) { mockFetchRequestForEventType(type); @@ -290,8 +291,10 @@ describe('Analytics', () => { originLevel3: url, }); const [body] = getParsedBodyCalls(); - if (type == EventType.view) expect(body.location.length).toBeLessThanOrEqual(desiredMax); - expect(body.originLevel3.length).toBeLessThanOrEqual(desiredMax); + if (type == EventType.view) { + expect(body.location).toHaveLength(expectedTruncatedLength); + } + expect(body.originLevel3).toHaveLength(expectedTruncatedLength); } it('for view events', () => testEventType(EventType.view, longUrl)); it('for click events', () => testEventType(EventType.click, longUrl)); diff --git a/src/client/analytics.ts b/src/client/analytics.ts index 62f3973..98d2156 100644 --- a/src/client/analytics.ts +++ b/src/client/analytics.ts @@ -35,8 +35,7 @@ import HistoryStore from '../history'; import {isApiKey} from './token'; import {isReactNative, ReactNativeRuntimeWarning} from '../react-native/react-native-utils'; import {doNotTrack} from '../donottrack'; -import {NullStorage} from '../storage'; -import {isObject} from './utils'; +import {isObject, truncateUrl} from './utils'; export const Version = 'v15'; @@ -651,11 +650,8 @@ export class CoveoAnalyticsClient implements AnalyticsClient, VisitorIdProvider return rest; } - private limit(input: string, length: number): string | undefined | null { - if (typeof input !== 'string') { - return input; - } - return input.substring(0, length); + private limit(input: T, length: number): T { + return typeof input === 'string' ? (truncateUrl(input, length) as T) : input; } private get baseUrl(): string { diff --git a/src/client/utils.spec.ts b/src/client/utils.spec.ts new file mode 100644 index 0000000..bf3ef5d --- /dev/null +++ b/src/client/utils.spec.ts @@ -0,0 +1,113 @@ +import {truncateUrl} from './utils'; + +describe('utils', () => { + describe('truncateUrl', () => { + // Note: to obtain a UTF-8 encoded escape sequence, run encodeUriComponent('') in your browser console or NodeJS REPL + const URL_PLAIN = 'http://coveo.com/this/is/a/really/long/url/that/will/be/truncated?at=some&arbitrary=point'; + + it.each([[8], [16], [32], [64], [128]])( + `truncateUrl('${URL_PLAIN}', %d) truncates to exactly that length`, + (limit) => { + expect(truncateUrl(URL_PLAIN, limit)).toBe(URL_PLAIN.substring(0, limit)); + } + ); + + /** Decoded: `'http://test/ ¿OKツ😅#fine'` */ + const URL_WITH_ESCAPES = 'http://test/%20%C2%BFOK%E3%83%84%F0%9F%98%85#fine'; + // Number of bytes in code-point: <1>< 2 > < 3 >< 4 > + + it.each([[7], [22], [45], [46], [47], [48], [100]])( + `truncateUrl('${URL_WITH_ESCAPES}', %d) truncates to the exact limit outside of codepoints`, + (limit) => { + expect(truncateUrl(URL_WITH_ESCAPES, limit)).toBe(URL_WITH_ESCAPES.substring(0, limit)); + } + ); + + it.each([ + [12, 12], + [13, 12], + [14, 12], + [15, 15], + ])( + `truncateUrl('${URL_WITH_ESCAPES}', %d) does not break up single-byte codepoints`, + (limit, expectedLength) => { + expect(truncateUrl(URL_WITH_ESCAPES, limit)).toBe(URL_WITH_ESCAPES.substring(0, expectedLength)); + } + ); + + it.each([ + [15, 15], + [16, 15], + [17, 15], + [18, 15], + [19, 15], + [20, 15], + [21, 21], + ])(`truncateUrl('${URL_WITH_ESCAPES}', %d) does not break up two-byte codepoints`, (limit, expectedLength) => { + expect(truncateUrl(URL_WITH_ESCAPES, limit)).toBe(URL_WITH_ESCAPES.substring(0, expectedLength)); + }); + + it.each([ + [23, 23], + [24, 23], + [25, 23], + [26, 23], + [27, 23], + [28, 23], + [29, 23], + [30, 23], + [31, 23], + [32, 32], + ])( + `truncateUrl('${URL_WITH_ESCAPES}', %d) does not break up three-byte codepoints`, + (limit, expectedLength) => { + expect(truncateUrl(URL_WITH_ESCAPES, limit)).toBe(URL_WITH_ESCAPES.substring(0, expectedLength)); + } + ); + + it.each([ + [32, 32], + [33, 32], + [34, 32], + [35, 32], + [36, 32], + [37, 32], + [38, 32], + [39, 32], + [40, 32], + [41, 32], + [42, 32], + [43, 32], + [44, 44], + ])(`truncateUrl('${URL_WITH_ESCAPES}', %d) does not break up four-byte codepoints`, (limit, expectedLength) => { + expect(truncateUrl(URL_WITH_ESCAPES, limit)).toBe(URL_WITH_ESCAPES.substring(0, expectedLength)); + }); + + const URL_WITH_INVALID_ESCAPES = 'http://test/%this%is%so%invalid'; + + it.each([ + [12, 12], + [13, 12], + [14, 12], + [15, 15], + [16, 16], + [17, 17], + [18, 17], + [19, 17], + [20, 20], + [21, 20], + [22, 20], + [23, 23], + [24, 23], + [25, 23], + [26, 26], + ])( + `truncateUrl('${URL_WITH_INVALID_ESCAPES}', %d) only checks for percent with invalid escapes`, + (limit, expectedLength) => { + expect(truncateUrl(URL_WITH_INVALID_ESCAPES, limit)).toBe( + URL_WITH_INVALID_ESCAPES.substring(0, expectedLength) + ); + } + ); + }); +}); diff --git a/src/client/utils.ts b/src/client/utils.ts index ee787dc..bba9630 100644 --- a/src/client/utils.ts +++ b/src/client/utils.ts @@ -15,3 +15,68 @@ export function isObject(o: any): boolean { export function coerceToNumber(input: any): any { return typeof input === 'string' && input != '' && !Number.isNaN(+input) ? +input : input; } + +/** + * Can be used to both check if the first bit is set (for any utf-8 multibyte part), + * and to detect "following bytes" (which start with `10xx_xxxx`) + */ +const UTF8_HIGH_BIT = 0b1000_0000; +/** Header for a 2-byte code point: `110x_xxxx`. Can also be used as bit-mask to check for "following bytes". */ +const UTF8_HEADER_2 = 0b1100_0000; +/** Header for a 3-byte code point: `1110_xxxx`. Can also be used as bit-mask to check for "header 2". */ +const UTF8_HEADER_3 = 0b1110_0000; +/** Header for a 4-byte code point: `1111_0xxx`. Can also be used as bit-mask to check for "header 3". */ +const UTF8_HEADER_4 = 0b1111_0000; + +function utf8ByteCountFromFirstByte(firstByte: number): number { + if ((firstByte & 0b1111_1000) === UTF8_HEADER_4) { + return 4; + } + if ((firstByte & UTF8_HEADER_4) === UTF8_HEADER_3) { + return 3; + } + if ((firstByte & UTF8_HEADER_3) === UTF8_HEADER_2) { + return 2; + } + return 1; +} + +/** + * Truncate a URL to an arbitrary length, taking care to not break inside a percent escape or UTF-8 multibyte sequence. + * + * @param input The input to truncate to the specified limit. + * @param limit The limit to apply; if negative, no truncation is applied. + * @returns The URL, possibly truncated to a length near limit (at most 11 characters less than limit). + */ +export function truncateUrl(input: string, limit: number): string { + if (limit < 0 || input.length <= limit) { + return input; + } + // A valid escape sequence is a percent followed by 2 hexadecimal characters; check if we split one up. + let end = input.indexOf('%', limit - 2); + if (end < 0 || end > limit) { + end = limit; + } else { + limit = end; + } + // Check that truncating at end won't break up an UTF-8 multibyte sequence half-way, + // by peeking backwards to find the first byte of an UTF-8 sequence (if present). + while (end > 2 && input.charAt(end - 3) == '%') { + const peekByte = Number.parseInt(input.substring(end - 2, end), 16); + // Note: if parsing fails, NaN gets coerced to 0 by the bitwise and. + if ((peekByte & UTF8_HIGH_BIT) != UTF8_HIGH_BIT) { + break; + } + end -= 3; + // Check if we reached the first byte by checking it is not a "follow byte": 10xx_xxxx. + if ((peekByte & UTF8_HEADER_2) != UTF8_HIGH_BIT) { + // If the full code point is there, keep it. + if (limit - end >= utf8ByteCountFromFirstByte(peekByte) * 3) { + end = limit; + } + // Otherwise, end is already set at the correct point to truncate at (the start of the multibyte sequence). + break; + } + } + return input.substring(0, end); +}