Skip to content

Commit

Permalink
fix: UA-9204 Safer URL truncation (#471)
Browse files Browse the repository at this point in the history
  • Loading branch information
rphilippen authored Aug 19, 2024
1 parent b97526b commit 09fa118
Show file tree
Hide file tree
Showing 4 changed files with 188 additions and 11 deletions.
11 changes: 7 additions & 4 deletions src/client/analytics.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ import {BrowserRuntime, NoopRuntime} from './runtimeEnvironment';
import * as doNotTrack from '../donottrack';
import {Cookie} from '../cookieutils';
import {CoveoLinkParam} from '../plugins/link';
import {NoopAnalytics} from './noopAnalytics';

const aVisitorId = '123';
jest.mock('uuid', () => ({
Expand Down Expand Up @@ -281,7 +280,9 @@ describe('Analytics', () => {

describe('should truncate the maxlength for URL parameters at 128 characters for ua events', () => {
const desiredMax: number = 128;
const longUrl: string = 'http://coveo.com/?q=' + 'a'.repeat(desiredMax);
// Craft the URL so the truncation point falls in the %20 sequence
const longUrl: string = 'http://coveo.com/?q=' + 'a'.repeat(desiredMax - 22) + '%20b';
const expectedTruncatedLength = longUrl.lastIndexOf('%', desiredMax);
expect(longUrl.length).toBeGreaterThan(desiredMax);
async function testEventType(type: EventType, url: string) {
mockFetchRequestForEventType(type);
Expand All @@ -290,8 +291,10 @@ describe('Analytics', () => {
originLevel3: url,
});
const [body] = getParsedBodyCalls();
if (type == EventType.view) expect(body.location.length).toBeLessThanOrEqual(desiredMax);
expect(body.originLevel3.length).toBeLessThanOrEqual(desiredMax);
if (type == EventType.view) {
expect(body.location).toHaveLength(expectedTruncatedLength);
}
expect(body.originLevel3).toHaveLength(expectedTruncatedLength);
}
it('for view events', () => testEventType(EventType.view, longUrl));
it('for click events', () => testEventType(EventType.click, longUrl));
Expand Down
10 changes: 3 additions & 7 deletions src/client/analytics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,7 @@ import HistoryStore from '../history';
import {isApiKey} from './token';
import {isReactNative, ReactNativeRuntimeWarning} from '../react-native/react-native-utils';
import {doNotTrack} from '../donottrack';
import {NullStorage} from '../storage';
import {isObject} from './utils';
import {isObject, truncateUrl} from './utils';

export const Version = 'v15';

Expand Down Expand Up @@ -651,11 +650,8 @@ export class CoveoAnalyticsClient implements AnalyticsClient, VisitorIdProvider
return rest;
}

private limit(input: string, length: number): string | undefined | null {
if (typeof input !== 'string') {
return input;
}
return input.substring(0, length);
private limit<T>(input: T, length: number): T {
return typeof input === 'string' ? (truncateUrl(input, length) as T) : input;
}

private get baseUrl(): string {
Expand Down
113 changes: 113 additions & 0 deletions src/client/utils.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import {truncateUrl} from './utils';

describe('utils', () => {
describe('truncateUrl', () => {
// Note: to obtain a UTF-8 encoded escape sequence, run encodeUriComponent('<value>') in your browser console or NodeJS REPL
const URL_PLAIN = 'http://coveo.com/this/is/a/really/long/url/that/will/be/truncated?at=some&arbitrary=point';

it.each([[8], [16], [32], [64], [128]])(
`truncateUrl('${URL_PLAIN}', %d) truncates to exactly that length`,
(limit) => {
expect(truncateUrl(URL_PLAIN, limit)).toBe(URL_PLAIN.substring(0, limit));
}
);

/** Decoded: `'http://test/ ¿OKツ😅#fine'` */
const URL_WITH_ESCAPES = 'http://test/%20%C2%BFOK%E3%83%84%F0%9F%98%85#fine';
// Number of bytes in code-point: <1>< 2 > < 3 >< 4 >

it.each([[7], [22], [45], [46], [47], [48], [100]])(
`truncateUrl('${URL_WITH_ESCAPES}', %d) truncates to the exact limit outside of codepoints`,
(limit) => {
expect(truncateUrl(URL_WITH_ESCAPES, limit)).toBe(URL_WITH_ESCAPES.substring(0, limit));
}
);

it.each([
[12, 12],
[13, 12],
[14, 12],
[15, 15],
])(
`truncateUrl('${URL_WITH_ESCAPES}', %d) does not break up single-byte codepoints`,
(limit, expectedLength) => {
expect(truncateUrl(URL_WITH_ESCAPES, limit)).toBe(URL_WITH_ESCAPES.substring(0, expectedLength));
}
);

it.each([
[15, 15],
[16, 15],
[17, 15],
[18, 15],
[19, 15],
[20, 15],
[21, 21],
])(`truncateUrl('${URL_WITH_ESCAPES}', %d) does not break up two-byte codepoints`, (limit, expectedLength) => {
expect(truncateUrl(URL_WITH_ESCAPES, limit)).toBe(URL_WITH_ESCAPES.substring(0, expectedLength));
});

it.each([
[23, 23],
[24, 23],
[25, 23],
[26, 23],
[27, 23],
[28, 23],
[29, 23],
[30, 23],
[31, 23],
[32, 32],
])(
`truncateUrl('${URL_WITH_ESCAPES}', %d) does not break up three-byte codepoints`,
(limit, expectedLength) => {
expect(truncateUrl(URL_WITH_ESCAPES, limit)).toBe(URL_WITH_ESCAPES.substring(0, expectedLength));
}
);

it.each([
[32, 32],
[33, 32],
[34, 32],
[35, 32],
[36, 32],
[37, 32],
[38, 32],
[39, 32],
[40, 32],
[41, 32],
[42, 32],
[43, 32],
[44, 44],
])(`truncateUrl('${URL_WITH_ESCAPES}', %d) does not break up four-byte codepoints`, (limit, expectedLength) => {
expect(truncateUrl(URL_WITH_ESCAPES, limit)).toBe(URL_WITH_ESCAPES.substring(0, expectedLength));
});

const URL_WITH_INVALID_ESCAPES = 'http://test/%this%is%so%invalid';

it.each([
[12, 12],
[13, 12],
[14, 12],
[15, 15],
[16, 16],
[17, 17],
[18, 17],
[19, 17],
[20, 20],
[21, 20],
[22, 20],
[23, 23],
[24, 23],
[25, 23],
[26, 26],
])(
`truncateUrl('${URL_WITH_INVALID_ESCAPES}', %d) only checks for percent with invalid escapes`,
(limit, expectedLength) => {
expect(truncateUrl(URL_WITH_INVALID_ESCAPES, limit)).toBe(
URL_WITH_INVALID_ESCAPES.substring(0, expectedLength)
);
}
);
});
});
65 changes: 65 additions & 0 deletions src/client/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,68 @@ export function isObject(o: any): boolean {
export function coerceToNumber(input: any): any {
return typeof input === 'string' && input != '' && !Number.isNaN(+input) ? +input : input;
}

/**
* Can be used to both check if the first bit is set (for any utf-8 multibyte part),
* and to detect "following bytes" (which start with `10xx_xxxx`)
*/
const UTF8_HIGH_BIT = 0b1000_0000;
/** Header for a 2-byte code point: `110x_xxxx`. Can also be used as bit-mask to check for "following bytes". */
const UTF8_HEADER_2 = 0b1100_0000;
/** Header for a 3-byte code point: `1110_xxxx`. Can also be used as bit-mask to check for "header 2". */
const UTF8_HEADER_3 = 0b1110_0000;
/** Header for a 4-byte code point: `1111_0xxx`. Can also be used as bit-mask to check for "header 3". */
const UTF8_HEADER_4 = 0b1111_0000;

function utf8ByteCountFromFirstByte(firstByte: number): number {
if ((firstByte & 0b1111_1000) === UTF8_HEADER_4) {
return 4;
}
if ((firstByte & UTF8_HEADER_4) === UTF8_HEADER_3) {
return 3;
}
if ((firstByte & UTF8_HEADER_3) === UTF8_HEADER_2) {
return 2;
}
return 1;
}

/**
* Truncate a URL to an arbitrary length, taking care to not break inside a percent escape or UTF-8 multibyte sequence.
*
* @param input The input to truncate to the specified limit.
* @param limit The limit to apply; if negative, no truncation is applied.
* @returns The URL, possibly truncated to a length near limit (at most 11 characters less than limit).
*/
export function truncateUrl(input: string, limit: number): string {
if (limit < 0 || input.length <= limit) {
return input;
}
// A valid escape sequence is a percent followed by 2 hexadecimal characters; check if we split one up.
let end = input.indexOf('%', limit - 2);
if (end < 0 || end > limit) {
end = limit;
} else {
limit = end;
}
// Check that truncating at end won't break up an UTF-8 multibyte sequence half-way,
// by peeking backwards to find the first byte of an UTF-8 sequence (if present).
while (end > 2 && input.charAt(end - 3) == '%') {
const peekByte = Number.parseInt(input.substring(end - 2, end), 16);
// Note: if parsing fails, NaN gets coerced to 0 by the bitwise and.
if ((peekByte & UTF8_HIGH_BIT) != UTF8_HIGH_BIT) {
break;
}
end -= 3;
// Check if we reached the first byte by checking it is not a "follow byte": 10xx_xxxx.
if ((peekByte & UTF8_HEADER_2) != UTF8_HIGH_BIT) {
// If the full code point is there, keep it.
if (limit - end >= utf8ByteCountFromFirstByte(peekByte) * 3) {
end = limit;
}
// Otherwise, end is already set at the correct point to truncate at (the start of the multibyte sequence).
break;
}
}
return input.substring(0, end);
}

0 comments on commit 09fa118

Please sign in to comment.