feat: add similarity function

radashi-org · Jul 23, 2024 · 20eb6f6 · 20eb6f6
1 parent dddd06f
commit 20eb6f6
Show file tree

Hide file tree

Showing 5 changed files with 146 additions and 0 deletions.
diff --git a/benchmarks/string/similarity.bench.ts b/benchmarks/string/similarity.bench.ts
@@ -0,0 +1,11 @@
+import * as _ from 'radashi'
+import { bench } from 'vitest'
+
describe('similarity', () => {
  // Build the inputs once, outside the measured callback, so only the
  // call to `similarity` itself is timed.
  const repeatedH = 'h'.repeat(100)
  const alternatingHa = 'ha'.repeat(50)

  // Both inputs are 100 characters long; every other character of the
  // second one differs from the first.
  bench('with 50% similar characters', () => {
    _.similarity(repeatedH, alternatingHa)
  })
})
diff --git a/docs/string/similarity.mdx b/docs/string/similarity.mdx
@@ -0,0 +1,16 @@
+---
+title: similarity
+description: Compare two strings and return a similarity score
+---
+
+### Usage
+
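+Computes the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) between two strings: the minimum number of single-character insertions, deletions, and substitutions needed to turn one string into the other. Despite the name, a lower result means the strings are more alike, and `0` means they are identical. The order of the arguments does not affect the result.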
+
+```ts
+import * as _ from 'radashi'
+
+_.similarity('abc', 'abc') // => 0
+_.similarity('abc', 'axc') // => 1
+_.similarity('kitten', 'sitting') // => 3
+```
+
+https://en.wikipedia.org/wiki/Levenshtein_distance
diff --git a/src/mod.ts b/src/mod.ts
@@ -100,6 +100,7 @@ export * from './string/camel.ts'
 export * from './string/capitalize.ts'
 export * from './string/dash.ts'
 export * from './string/pascal.ts'
+export * from './string/similarity.ts'
 export * from './string/snake.ts'
 export * from './string/template.ts'
 export * from './string/title.ts'

diff --git a/src/string/similarity.ts b/src/string/similarity.ts
@@ -0,0 +1,85 @@
/**
 * Calculate the similarity between two strings using the Levenshtein
 * distance algorithm.
 *
 * The result is an edit distance: the minimum number of
 * single-character insertions, deletions, and substitutions needed to
 * turn one string into the other. A lower number therefore means the
 * strings are more alike, and `0` means they are identical.
 *
 * One thing to note is that the argument order is unimportant. The
 * algorithm will always return the same result regardless of the
 * order of the arguments.
 *
 * Characters are compared as UTF-16 code units, the same units used
 * by string indexing and `String.prototype.length`.
 *
 * Only a single row of the distance table is kept in memory, so the
 * extra space used is proportional to the shorter input (after the
 * common prefix and suffix are removed) rather than to the product of
 * both lengths.
 *
 * Adapted from
 * [@fabiospampinato/tiny-levenshtein](https://github.com/fabiospampinato/tiny-levenshtein)
 * with ❤️.
 *
 * @see https://radashi-org.github.io/reference/string/similarity
 * @param str1 - The first string to compare.
 * @param str2 - The second string to compare.
 * @returns The Levenshtein distance between `str1` and `str2`.
 * @example
 * ```ts
 * similarity('abc', 'abc') // 0
 * similarity('a', 'b') // 1
 * similarity('ab', 'ac') // 1
 * similarity('ac', 'bc') // 1
 * similarity('abc', 'axc') // 1
 * similarity('kitten', 'sitting') // 3
 * ```
 */
export function similarity(str1: string, str2: string): number {
  // Early return if strings are identical
  if (str1 === str2) {
    return 0
  }

  // Find common prefix and suffix. Shared leading/trailing characters
  // never contribute to the distance, so they can be skipped.
  let start = 0
  let end1 = str1.length - 1
  let end2 = str2.length - 1

  while (start <= end1 && start <= end2 && str1[start] === str2[start]) {
    start++
  }

  while (end1 >= start && end2 >= start && str1[end1] === str2[end2]) {
    end1--
    end2--
  }

  // Calculate lengths of trimmed strings
  const length1 = end1 - start + 1
  const length2 = end2 - start + 1

  // Handle cases where one trimmed string is empty: every remaining
  // character of the other one must be inserted (or deleted).
  if (length1 === 0) {
    return length2
  }
  if (length2 === 0) {
    return length1
  }

  // The distance is symmetric, so walk the longer string in the outer
  // loop and size the row buffer by the shorter one.
  let outer = str1
  let inner = str2
  let outerLength = length1
  let innerLength = length2
  if (length1 < length2) {
    outer = str2
    inner = str1
    outerLength = length2
    innerLength = length1
  }

  // row[j] holds the distance between the first `i` characters of
  // `outer` and the first `j` characters of `inner` (both counted from
  // `start`). It begins as the i = 0 row: j insertions.
  const row = new Array<number>(innerLength + 1)
  for (let j = 0; j <= innerLength; j++) {
    row[j] = j
  }

  for (let i = 1; i <= outerLength; i++) {
    const outerChar = outer.charCodeAt(start + i - 1)

    // `diagonal` tracks the value of the previous row at column j - 1,
    // which is overwritten in `row` before it is needed.
    let diagonal = row[0]
    row[0] = i

    for (let j = 1; j <= innerLength; j++) {
      const above = row[j]
      row[j] = Math.min(
        // Cost of a deletion.
        above + 1,
        // Cost of an insertion.
        row[j - 1] + 1,
        // Cost of a substitution.
        diagonal + (outerChar === inner.charCodeAt(start + j - 1) ? 0 : 1),
      )
      diagonal = above
    }
  }

  // Return the Levenshtein distance
  return row[innerLength]
}
diff --git a/tests/string/similarity.test.ts b/tests/string/similarity.test.ts
@@ -0,0 +1,33 @@
+import * as _ from 'radashi'
+
describe('similarity', () => {
  // Fixtures taken from the tiny-levenshtein test suite:
  // https://github.com/fabiospampinato/tiny-levenshtein/blob/master/test/index.js
  test('returns the distance between two strings', () => {
    // Each entry is [first string, second string, expected distance].
    const fixtures: [string, string, number][] = [
      ['abc', 'abc', 0],
      ['a', 'b', 1],
      ['ab', 'ac', 1],
      ['ac', 'bc', 1],
      ['abc', 'axc', 1],
      ['kitten', 'sitting', 3],
      ['xabxcdxxefxgx', '1ab2cd34ef5g6', 6],
      ['cat', 'cow', 2],
      ['xabxcdxxefxgx', 'abcdefg', 6],
      ['javawasneat', 'scalaisgreat', 7],
      ['example', 'samples', 3],
      ['sturgeon', 'urgently', 6],
      ['levenshtein', 'frankenstein', 6],
      ['distance', 'difference', 5],
      ['因為我是中國人所以我會說中文', '因為我是英國人所以我會說英文', 2],
    ]

    for (const [left, right, expected] of fixtures) {
      expect(_.similarity(left, right)).toBe(expected)
    }
  })

  // One input is a prefix of the other, checked in both argument orders.
  test('containment', () => {
    const fixtures: [string, string, number][] = [
      ['abababab', 'ab', 6],
      ['ab', 'abababab', 6],
      ['abc', 'ab', 1],
      ['ab', 'abc', 1],
    ]

    for (const [left, right, expected] of fixtures) {
      expect(_.similarity(left, right)).toBe(expected)
    }
  })
})