diff --git a/benchmarks/string/similarity.bench.ts b/benchmarks/string/similarity.bench.ts new file mode 100644 index 00000000..280cb5e7 --- /dev/null +++ b/benchmarks/string/similarity.bench.ts @@ -0,0 +1,11 @@ +import * as _ from 'radashi' +import { bench } from 'vitest' + +describe('similarity', () => { + const string1 = 'h'.repeat(100) + const string2 = 'ha'.repeat(50) + + bench('with 50% similar characters', () => { + _.similarity(string1, string2) + }) +}) diff --git a/docs/string/similarity.mdx b/docs/string/similarity.mdx new file mode 100644 index 00000000..8a3c8860 --- /dev/null +++ b/docs/string/similarity.mdx @@ -0,0 +1,16 @@ +--- +title: similarity +description: Compare two strings and return their Levenshtein distance +--- + +### Usage + +Computes the Levenshtein distance between two strings: the minimum number of single-character insertions, deletions, and substitutions needed to turn one string into the other. Identical strings return `0`, and larger values mean the strings are less alike. + +```ts +import * as _ from 'radashi' + +_.similarity('kitten', 'sitting') // => 3 +``` + +https://en.wikipedia.org/wiki/Levenshtein_distance diff --git a/src/mod.ts b/src/mod.ts index 426f63a5..ae47f9bf 100644 --- a/src/mod.ts +++ b/src/mod.ts @@ -100,6 +100,7 @@ export * from './string/camel.ts' export * from './string/capitalize.ts' export * from './string/dash.ts' export * from './string/pascal.ts' +export * from './string/similarity.ts' export * from './string/snake.ts' export * from './string/template.ts' export * from './string/title.ts' diff --git a/src/string/similarity.ts b/src/string/similarity.ts new file mode 100644 index 00000000..7d5facf8 --- /dev/null +++ b/src/string/similarity.ts @@ -0,0 +1,85 @@ +/** + * Calculate the similarity between two strings using the Levenshtein + * distance algorithm. + * + * One thing to note is that the argument order is unimportant. The + * algorithm will always return the same result regardless of the + * order of the arguments. + * + * Adapted from + * [@fabiospampinato/tiny-levenshtein](https://github.com/fabiospampinato/tiny-levenshtein) + * with ❤️. 
/**
 * Calculate the Levenshtein distance between two strings: the minimum
 * number of single-character insertions, deletions, and substitutions
 * needed to turn one string into the other.
 *
 * Despite the name, a *lower* result means the strings are more
 * alike. Identical strings return `0`.
 *
 * The argument order is unimportant. The algorithm will always return
 * the same result regardless of the order of the arguments.
 *
 * Characters are compared as UTF-16 code units, so a character stored
 * as a surrogate pair (e.g. most emoji) counts as two units.
 *
 * Only one row of the distance matrix is kept in memory, laid out
 * along the shorter string, so memory use grows with the shorter
 * input rather than with the product of both lengths.
 *
 * Adapted from
 * [@fabiospampinato/tiny-levenshtein](https://github.com/fabiospampinato/tiny-levenshtein)
 * with ❤️.
 *
 * @see https://radashi-org.github.io/reference/string/similarity
 * @param str1 - The first string to compare.
 * @param str2 - The second string to compare.
 * @returns The number of edits separating the two strings (`0` when
 * they are identical).
 * @example
 * ```ts
 * similarity('abc', 'abc') // 0
 * similarity('a', 'b') // 1
 * similarity('ab', 'ac') // 1
 * similarity('ac', 'bc') // 1
 * similarity('abc', 'axc') // 1
 * similarity('kitten', 'sitting') // 3
 * ```
 */
export function similarity(str1: string, str2: string): number {
  // Early return if strings are identical
  if (str1 === str2) {
    return 0
  }

  // A shared prefix or suffix never contributes to the distance, so
  // skip both. `end1` and `end2` are exclusive bounds.
  let start = 0
  let end1 = str1.length
  let end2 = str2.length

  while (
    start < end1 &&
    start < end2 &&
    str1.charCodeAt(start) === str2.charCodeAt(start)
  ) {
    start++
  }

  while (
    end1 > start &&
    end2 > start &&
    str1.charCodeAt(end1 - 1) === str2.charCodeAt(end2 - 1)
  ) {
    end1--
    end2--
  }

  // Lengths of what is left after trimming
  const length1 = end1 - start
  const length2 = end2 - start

  // If one remainder is empty, the other must be inserted in full
  if (length1 === 0) {
    return length2
  }
  if (length2 === 0) {
    return length1
  }

  // The distance is symmetric, so put the shorter remainder on the
  // inner loop. That keeps the rolling row as small as possible.
  const swapped = length1 < length2
  const outer = swapped ? str2 : str1
  const outerLength = swapped ? length2 : length1
  const inner = swapped ? str1 : str2
  const innerLength = swapped ? length1 : length2

  // row[x] holds the distance between the outer prefix processed so
  // far and the first `x` units of the inner remainder.
  const row = new Array<number>(innerLength + 1)
  for (let x = 0; x <= innerLength; x++) {
    row[x] = x
  }

  for (let y = 1; y <= outerLength; y++) {
    const outerCode = outer.charCodeAt(start + y - 1)

    // `diagonal` is the previous row's value at x - 1, `left` is the
    // current row's value at x - 1.
    let diagonal = row[0]
    let left = y
    row[0] = y

    for (let x = 1; x <= innerLength; x++) {
      const above = row[x]
      const substitution =
        diagonal + (outerCode === inner.charCodeAt(start + x - 1) ? 0 : 1)

      // Cheapest of: deletion, insertion, substitution.
      left = Math.min(above + 1, left + 1, substitution)

      row[x] = left
      diagonal = above
    }
  }

  // Return the Levenshtein distance
  return row[innerLength]
}