// Source: packages/long-memory/src/similarity.ts
// (captured from a diff view: 70 changes, 52 additions and 18 deletions; first hunk @@ -1,14 +1,14 @@)
import { cut } from 'jieba-wasm'
import TinySegmenter from 'tiny-segmenter'

// Lint warning (GitHub Actions / lint) on line 2 of packages/long-memory/src/similarity.ts:
// import name `TinySegmenter` must match one of the following formats: camelCase, UPPER_CASE.
import stopwords from 'stopwords-iso'

// Single TinySegmenter instance, constructed once when this module is loaded.
// NOTE(review): its call sites are outside this view — presumably the text
// tokenization helper uses it; confirm before changing its lifetime.
const segmenter = new TinySegmenter()

/**
 * Relative weight of each similarity metric (cosine, Levenshtein, Jaccard,
 * BM25). The four weights sum to 1.
 *
 * Each key appears exactly once: repeating a key in the literal is rejected
 * by the compiler, and at runtime only the last value would survive anyway.
 */
const SIMILARITY_WEIGHTS = {
    cosine: 0.35,
    levenshtein: 0.05,
    jaccard: 0.1,
    bm25: 0.5
} as const

function validateAndAdjustWeights(weights: typeof SIMILARITY_WEIGHTS) {
Expand Down Expand Up @@ -185,23 +185,26 @@
return dotProduct / (magnitude1 * magnitude2)
}

/**
 * Symmetric BM25-style similarity between two strings.
 *
 * Both strings are tokenized, then each one is scored against the other
 * with a BM25 term weight. Every direction is normalized by the best score
 * that direction could reach, and the two normalized values are averaged.
 *
 * Only terms that occur in BOTH texts add to the achieved score, while
 * every term adds to the attainable maximum. Texts without any common term
 * therefore score 0 and identical texts score 1.
 *
 * @param s1 - First text.
 * @param s2 - Second text.
 * @returns A value in [0, 1]; 0 when either text yields no tokens.
 */
private static calculateBM25Similarity(s1: string, s2: string): number {
    const k1 = 1.5 // term-frequency saturation parameter
    const b = 0.75 // document-length normalization parameter
    const epsilon = 0.25 // smoothing factor

    const tokens1 = TextTokenizer.tokenize(s1)
    const tokens2 = TextTokenizer.tokenize(s2)

    if (tokens1.length === 0 || tokens2.length === 0) {
        return 0
    }

    const doc1Length = tokens1.length
    const doc2Length = tokens2.length
    const avgDocLength = (doc1Length + doc2Length) / 2

    // Per-document term frequencies.
    const termFreqDoc1 = new Map<string, number>()
    const termFreqDoc2 = new Map<string, number>()

    tokens1.forEach((token) => {
        termFreqDoc1.set(token, (termFreqDoc1.get(token) || 0) + 1)
    })

    tokens2.forEach((token) => {
        termFreqDoc2.set(token, (termFreqDoc2.get(token) || 0) + 1)
    })

    // Scores one direction: the terms of `ownFreq` measured against `otherFreq`.
    const directionalScore = (
        ownFreq: Map<string, number>,
        otherFreq: Map<string, number>,
        ownLength: number
    ): number => {
        // Length-normalization part of the BM25 denominator; constant per direction.
        const lengthNorm = k1 * (1 - b + b * (ownLength / avgDocLength))

        let achieved = 0
        let attainable = 0

        for (const [term, tf] of ownFreq) {
            const otherTf = otherFreq.get(term) || 0
            // 1 when the other text contains the term, otherwise 0.
            const docFreq = otherTf > 0 ? 1 : 0
            // The "+ 1" inside the logarithm keeps the weight strictly positive.
            const idf = Math.log(
                (2 - docFreq + epsilon) / (docFreq + epsilon) + 1
            )

            // Upper bound for this term: the larger of the two frequencies.
            const maxTf = Math.max(tf, otherTf)
            attainable += idf * ((maxTf * (k1 + 1)) / (maxTf + lengthNorm))

            // A term missing from the other text is a mismatch and earns nothing.
            if (otherTf > 0) {
                achieved += idf * ((tf * (k1 + 1)) / (tf + lengthNorm))
            }
        }

        return attainable > 0 ? achieved / attainable : 0
    }

    const normalizedScore1 = directionalScore(
        termFreqDoc1,
        termFreqDoc2,
        doc1Length
    )
    const normalizedScore2 = directionalScore(
        termFreqDoc2,
        termFreqDoc1,
        doc2Length
    )

    return (normalizedScore1 + normalizedScore2) / 2
}

public static calculate(str1: string, str2: string): SimilarityResult {
Expand Down
Loading