Skip to content

Commit

Permalink
feat: add similarity function
Browse files Browse the repository at this point in the history
  • Loading branch information
aleclarson committed Jul 23, 2024
1 parent dddd06f commit 20eb6f6
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 0 deletions.
11 changes: 11 additions & 0 deletions benchmarks/string/similarity.bench.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import * as _ from 'radashi'
import { bench } from 'vitest'

describe('similarity', () => {
  // Two 100-character inputs that agree on every other character:
  // one is all "h", the other alternates "h" and "a".
  const onlyH = 'h'.repeat(100)
  const alternatingHA = 'ha'.repeat(50)

  bench('with 50% similar characters', () => {
    _.similarity(onlyH, alternatingHA)
  })
})
16 changes: 16 additions & 0 deletions docs/string/similarity.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
---
title: similarity
description: Compare two strings and return a similarity score
---

### Usage

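Computes the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) between two strings: the minimum number of single-character insertions, deletions, or substitutions needed to turn one string into the other. A result of `0` means the strings are identical; larger numbers mean the strings are less similar. The order of the arguments does not affect the result.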

```ts
import * as _ from 'radashi'

_.similarity('abc', 'abc') // => 0
_.similarity('abc', 'axc') // => 1
_.similarity('kitten', 'sitting') // => 3
```

https://en.wikipedia.org/wiki/Levenshtein_distance
1 change: 1 addition & 0 deletions src/mod.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ export * from './string/camel.ts'
export * from './string/capitalize.ts'
export * from './string/dash.ts'
export * from './string/pascal.ts'
export * from './string/similarity.ts'
export * from './string/snake.ts'
export * from './string/template.ts'
export * from './string/title.ts'
Expand Down
85 changes: 85 additions & 0 deletions src/string/similarity.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/**
 * Calculate how different two strings are using the Levenshtein
 * distance algorithm.
 *
 * Despite the name, the returned number is an edit distance: `0`
 * means the strings are identical, and larger values mean they are
 * less similar. It is the minimum number of single-character
 * insertions, deletions, or substitutions needed to turn one string
 * into the other. Characters are compared by string index, i.e. as
 * UTF-16 code units.
 *
 * One thing to note is that the argument order is unimportant. The
 * algorithm will always return the same result regardless of the
 * order of the arguments.
 *
 * Memory use is proportional to the shorter string (after the common
 * prefix and suffix are trimmed), not to the product of both lengths.
 *
 * Adapted from
 * [@fabiospampinato/tiny-levenshtein](https://github.com/fabiospampinato/tiny-levenshtein)
 * with ❤️.
 *
 * @see https://radashi-org.github.io/reference/string/similarity
 * @param str1 The first string to compare.
 * @param str2 The second string to compare.
 * @returns The Levenshtein distance between `str1` and `str2`.
 * @example
 * ```ts
 * similarity('abc', 'abc') // 0
 * similarity('a', 'b') // 1
 * similarity('ab', 'ac') // 1
 * similarity('ac', 'bc') // 1
 * similarity('abc', 'axc') // 1
 * similarity('kitten', 'sitting') // 3
 * ```
 */
export function similarity(str1: string, str2: string): number {
  // Early return if strings are identical
  if (str1 === str2) {
    return 0
  }

  // Skip the common prefix and suffix: they never contribute to the
  // distance, so only the differing middle sections need comparing.
  let start = 0
  let end1 = str1.length - 1
  let end2 = str2.length - 1

  while (start <= end1 && start <= end2 && str1[start] === str2[start]) {
    start++
  }

  while (end1 >= start && end2 >= start && str1[end1] === str2[end2]) {
    end1--
    end2--
  }

  // Calculate lengths of trimmed strings
  const length1 = end1 - start + 1
  const length2 = end2 - start + 1

  // If one trimmed string is empty, the distance is simply the number
  // of characters left over in the other.
  if (length1 === 0) {
    return length2
  }
  if (length2 === 0) {
    return length1
  }

  // The distance is symmetric, so let the shorter string define the
  // row width. This keeps the working buffer as small as possible.
  let shorter = str1
  let longer = str2
  let shortLength = length1
  let longLength = length2
  if (length1 > length2) {
    shorter = str2
    longer = str1
    shortLength = length2
    longLength = length1
  }

  // `row[j]` holds the distance between the first `j` characters of
  // `shorter` and the prefix of `longer` processed so far. Only one
  // row is kept instead of the full matrix, which avoids allocating
  // (and potentially failing to allocate) a quadratic-size array.
  const row = new Array<number>(shortLength + 1)
  for (let j = 0; j <= shortLength; j++) {
    row[j] = j
  }

  for (let i = 1; i <= longLength; i++) {
    const longChar = longer[start + i - 1]

    // Value of the cell diagonally up-left of the one being computed.
    let diagonal = row[0]
    row[0] = i

    for (let j = 1; j <= shortLength; j++) {
      // Value from the previous row, saved before it is overwritten.
      const above = row[j]
      row[j] = Math.min(
        // Cost of a deletion.
        above + 1,
        // Cost of an insertion.
        row[j - 1] + 1,
        // Cost of a substitution.
        diagonal + (longChar === shorter[start + j - 1] ? 0 : 1),
      )
      diagonal = above
    }
  }

  // Return the Levenshtein distance
  return row[shortLength]
}
33 changes: 33 additions & 0 deletions tests/string/similarity.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import * as _ from 'radashi'

describe('similarity', () => {
  // https://github.com/fabiospampinato/tiny-levenshtein/blob/master/test/index.js
  test('returns the distance between two strings', () => {
    // Each entry is [first string, second string, expected distance].
    const cases: [string, string, number][] = [
      ['abc', 'abc', 0],
      ['a', 'b', 1],
      ['ab', 'ac', 1],
      ['ac', 'bc', 1],
      ['abc', 'axc', 1],
      ['kitten', 'sitting', 3],
      ['xabxcdxxefxgx', '1ab2cd34ef5g6', 6],
      ['cat', 'cow', 2],
      ['xabxcdxxefxgx', 'abcdefg', 6],
      ['javawasneat', 'scalaisgreat', 7],
      ['example', 'samples', 3],
      ['sturgeon', 'urgently', 6],
      ['levenshtein', 'frankenstein', 6],
      ['distance', 'difference', 5],
      ['因為我是中國人所以我會說中文', '因為我是英國人所以我會說英文', 2],
    ]

    for (const [left, right, expected] of cases) {
      expect(_.similarity(left, right)).toBe(expected)
    }
  })
  test('containment', () => {
    // One string is contained in the other; both argument orders are checked.
    const cases: [string, string, number][] = [
      ['abababab', 'ab', 6],
      ['ab', 'abababab', 6],
      ['abc', 'ab', 1],
      ['ab', 'abc', 1],
    ]

    for (const [left, right, expected] of cases) {
      expect(_.similarity(left, right)).toBe(expected)
    }
  })
})

0 comments on commit 20eb6f6

Please sign in to comment.