-
Notifications
You must be signed in to change notification settings - Fork 1.5k
/
Bitap.cs
243 lines (217 loc) · 9.7 KB
/
Bitap.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
using System;
namespace Algorithms.Strings.PatternMatching;
/// <summary>
/// The Bitap algorithm is a fuzzy string matching technique. It ains to find approximate matches of a pattern within a
/// text, allowing for a certain degree of mismatch (e.g., mistypes, minor variations etc.). It's knowd for its efficiency,
/// using bitwise operations for fast comparisons.
///
/// <para>
/// <b>How it works:</b>
/// <list type="number">
/// <item>
/// <term>Initialization</term>
/// <description>
/// Bitmasks are created for each character in the pattern. These bitmasks are essentially binary numbers where each bit
/// represents a specific character's position within the pattern. An initial state variable <c>R</c> is set to all 1s,
/// indicating that all characters in the pattern are initially unmatched.
/// </description>
/// </item>
/// <item>
/// <term>Iteration</term>
/// <description>
/// The algorithm iterates through each character in the text. For each character, the state <c>R</c> is updated using
/// bitwise operations (shifts and logical ORs). This update reflects whether the current character in the text matches
/// the corresponding character in the pattern.
/// </description>
/// </item>
/// <item>
/// <term>Matching</term>
/// <description>
/// After each iteration, the algorithm checks if the least significant bit of <c>R</c> is set to 1.
/// If it is, it means there's a potential match at that position, with a mismatch distance that's within the allowed
/// threshold.
/// </description>
/// </item>
/// </list>
/// </para>
/// <para>
/// <b> Finding Matches </b>
/// </para>
/// <para>
/// If the least significant bit of <c>R</c> is 1, it means a potential match is found.
/// The number of leading zeros in <c>R</c> indicates the mismatch distance.
/// If this distance is within the allowed threshold, it's considered a valid match.
/// </para>
/// </summary>
public static class Bitap
{
/// <summary>
/// <para>
/// This function implements the Bitap algorithm for finding exact matches of a pattern within a text.
/// It aims to find the first occurrence of the pattern in the text, allowing for no mismatches.
/// </para>
/// <para>
/// The algorithm iterates through each character in the text. For each character, the state <c>R</c> is updated using
/// bitwise operations (shifts and logical ORs). This update reflects whether the current character in the text matches
/// the corresponding character in the pattern.
/// </para>
/// <para>
/// After each iteration, the algorithm checks if the least significant bit of <c>R</c> is set to 1.
/// If it is, it means there's a potential match at that position, with a mismatch distance of 0.
/// The function returns the index of the first occurrence of the pattern in the text, or -1 if not found.
/// </para>
/// <para>
/// The function throws an <see cref="ArgumentException"/> if the pattern is longer than 31 characters.
/// This is because the maximum length of the pattern is 31, because if it's longer than that,
/// we won't be able to represent the pattern mask in an int.
/// </para>
/// </summary>
/// <param name="text">The text to search in.</param>
/// <param name="pattern">The pattern to search for.</param>
/// <returns>The index of the first occurrence of the pattern in the text, or -1 if not found.</returns>
/// <exception cref="ArgumentException">The pattern is longer than 31 characters.</exception>
public static int FindExactPattern(string text, string pattern)
{
// The length of the pattern.
var len = pattern.Length;
// An array of integers that will be used to mask the pattern.
// The pattern mask is a bitmask that we will use to search for the pattern characters
// in the text. We'll set the bit corresponding to the character in the pattern
// to 0, and then use bitwise operations to check for the pattern.
var patternMask = new int[128];
int index;
// Check if the pattern is empty.
if (string.IsNullOrEmpty(pattern))
{
return 0;
}
// Check if the pattern is longer than 31 characters.
if (len > 31)
{
throw new ArgumentException("The pattern is longer than 31 characters.");
}
// Initialize the register <c>R</c> to all 1s.
var r = ~1;
// Initialize the pattern mask to all 1s.
for (index = 0; index <= 127; ++index)
{
patternMask[index] = ~0;
}
// Set the bits corresponding to the characters in the pattern to 0 in the pattern mask.
for (index = 0; index < len; ++index)
{
patternMask[pattern[index]] &= ~(1 << index);
}
// Iterate through each character in the text.
for (index = 0; index < text.Length; ++index)
{
// Update the state <c>R</c> by ORing the pattern mask with the character in the text,
// and then shift it to the left by 1.
r |= patternMask[text[index]];
r <<= 1;
// Check if the least significant bit of <c>R</c> is set to 1.
// If there's a potential match at that position, with a mismatch distance of 0,
// return the index of the first occurrence of the pattern in the text.
if ((r & 1 << len) == 0)
{
return index - len + 1;
}
}
// If no match is found, return -1.
return -1;
}
/// <summary>
/// Finds the first occurrence of a pattern in a given text with a given threshold for mismatches.
/// </summary>
/// <param name="text">The text to search in.</param>
/// <param name="pattern">The pattern to search for.</param>
/// <param name="threshold">The maximum number of mismatches allowed.</param>
/// <returns>The index of the first occurrence of the pattern in the text, or -1 if not found.</returns>
public static int FindFuzzyPattern(string text, string pattern, int threshold)
{
// Create a pattern mask for each character in the pattern.
// The pattern mask is a bitmask that we will use to search for the pattern characters
// in the text. We'll set the bit corresponding to the character in the pattern
// to 0, and then use bitwise operations to check for the pattern.
var patternMask = new int[128];
// Create a register array.
// The register array is used to keep track of the pattern mask as we search for the pattern.
// We'll start with a register that has all bits set to 1, because all bits in the pattern mask
// will be set to 1 initially.
var r = new int[(threshold + 1) * sizeof(int)];
var len = pattern.Length;
// Check for empty strings.
// If the text is empty, return 0.
// If the pattern is empty, return 0.
if (string.IsNullOrEmpty(text))
{
return 0;
}
if (string.IsNullOrEmpty(pattern))
{
return 0;
}
// Check for a pattern that is too long.
// If the pattern is longer than 31 characters, return -1.
// The maximum length of the pattern is 31, because if it's longer than that,
// we won't be able to represent the pattern mask in an int.
if (len > 31)
{
return -1;
}
// Initialize the register.
// Set the least significant bit in the register to 0 or 1
// depending on whether the current character in the text matches the pattern.
// This will make it easier to check for the pattern later.
for (var i = 0; i <= threshold; ++i)
{
r[i] = ~1;
}
// Initialize the pattern mask.
// Set the bit corresponding to each character in the pattern to 0 in the pattern mask.
// This will make it easier to check for the pattern later.
for (var i = 0; i <= 127; i++)
{
patternMask[i] = ~0;
}
// Set the pattern mask for each character in the pattern.
// Use bitwise AND to clear the bit corresponding to the current character.
for (var i = 0; i < len; ++i)
{
patternMask[pattern[i]] &= ~(1 << i);
}
// Search for the pattern in the text.
// Loop through each character in the text.
for (var i = 0; i < text.Length; ++i)
{
// Update the register.
// Set the least significant bit in the register to 0 or 1
// depending on whether the current character in the text matches the pattern.
// This will make it easier to check for the pattern later.
var oldR = r[0];
r[0] |= patternMask[text[i]];
r[0] <<= 1;
// Update the other registers.
// Set the least significant bit in each register to 0 or 1
// depending on whether the current character in the text matches the pattern.
// This will make it easier to check for the pattern later.
for (var j = 1; j <= threshold; ++j)
{
var tmp = r[j];
r[j] = (oldR & (r[j] | patternMask[text[i]])) << 1;
oldR = tmp;
}
// If the pattern has been found, return the index.
// Check the most significant bit in the register.
// If it's 0, then the pattern has been found.
if ((r[threshold] & 1 << len) == 0)
{
// The pattern has been found.
// Return the index of the first character in the pattern.
return i - len + 1;
}
}
// The pattern has not been found.
return -1;
}
}