From fa120bdf714f8ebcec63dc90f99e98b0945beeb0 Mon Sep 17 00:00:00 2001 From: kekeandzeyu Date: Fri, 13 Sep 2024 18:33:11 +0800 Subject: [PATCH] add more notes in data structures and algorithms --- .../images_data/{d30-3-1.png => d21-3-1.png} | Bin .../Data-Structures-and-Algorithms-3.md | 329 +++++++++--------- 2 files changed, 159 insertions(+), 170 deletions(-) rename Writerside/images_data/{d30-3-1.png => d21-3-1.png} (100%) diff --git a/Writerside/images_data/d30-3-1.png b/Writerside/images_data/d21-3-1.png similarity index 100% rename from Writerside/images_data/d30-3-1.png rename to Writerside/images_data/d21-3-1.png diff --git a/Writerside/topics/Data-Structures-and-Algorithms-3.md b/Writerside/topics/Data-Structures-and-Algorithms-3.md index aa2206c..24a6602 100644 --- a/Writerside/topics/Data-Structures-and-Algorithms-3.md +++ b/Writerside/topics/Data-Structures-and-Algorithms-3.md @@ -1675,25 +1675,38 @@ class TernarySearchTree: ### 21.1 Introduction - -
  • Goal: Find pattern of length M in text of length N (typically -N >> M).

    +N \gg M).

    + +

    Applications:

    + + +
  • +

    Find & replace

    +
  • +
  • +

    Computer forensics

    +
  • +
  • +

    Identify patterns indicative of spam

    +
  • +
  • +

    Electronic surveillance

  • -

    Applications: Find & replace, -computer forensics, identify patterns indicative of spam, -electronic surveillance, screen scraping, etc.

    +

    Screen scraping

  • ### 21.2 Brute-Force Substring Search -* Theoretical challenge: Linear-time guarantee. - (Worst case: \sim MN) -* Practical challenge: Avoid backup in text stream. (Brute-force - algorithm needs backup for every mismatch) + +
  • Theoretical challenge: Linear-time guarantee. +(Worst case: \sim MN)
  • +
  • Practical challenge: Avoid backup in text stream. (Brute-force +algorithm needs backup for every mismatch)
  • +
    Java @@ -1796,138 +1809,114 @@ def brute_force_search(main_string, sub_string): #### 21.3.1 Proposition -* KMP substring search accesses no more than M + N - chars to search for a pattern of length M in a text of - length N. +

    Property: KMP substring search +accesses no more than M + N chars to search for a pattern +of length M in a text of length N.

    -> Proof: Each pattern char accessed once when constructing DFA; -> each text char accessed once (in the worst case) when simulating -> DFA. -> -{style = "tip"} +

    Proof: Each pattern char +accessed once when constructing DFA; each text char accessed once (in +the worst case) when simulating DFA.

    -* KMP constructs `dfa[][]` in time and space proportional to RM, - where R is the alphabet size and M is the pattern - length. - -> Improved version of KMP constructs `nfa[]` in time and space -> proportional to M. -> -{style = "tip"} +

    Property: KMP constructs +dfa[][] in time and space proportional to RM, +where R is the alphabet size and M is the +pattern length.

    #### 21.3.2 DFA -Deterministic Finite State Automaton (DFA) is an abstract -string-search machine. +

    Deterministic Finite State Automaton +(DFA) is an abstract string-search machine.

    -* Finite number of states (including start and halt). -* Exactly one transition for each char in alphabet. -* Accept if sequence of transitions lead to halt state. + +
  • +

    Finite number of states (including start and halt).

    +
  • +
  • +

    Exactly one transition for each char in alphabet.

    +
  • +
  • +

    Accept if the sequence of transitions leads to the halt state.

    +
  • +
    -Alt text +DFA -DFA state = number of characters in pattern that have been matched (length -of longest prefix of `pat[]` that is a suffix of `txt[0...i]`). + +

    DFA state = number of characters in pattern that have been matched +(length of longest prefix of pat[] that is a suffix of +txt[0...i]).

    +
    -To compute DFA: If in state j and next char `c != pat.charAt(j)`, -then the last j - 1 characters of input are `pat[1...j - 1]`, -followed by `c`. Simulate `pat[1...j - 1]` on DFA and take transition c. + + +

    If in state j (the first j characters of the + pattern have already been matched) and the next char c == + pat.charAt(j) (the next char matches), go to state j+1 (now the + first j+1 characters of the pattern have been matched). +

    +
    + +

    If in state j and next char c != + pat.charAt(j), then the last j-1 characters of input are + pat[1...j - 1], followed by c. Simulate + pat[1...j - 1] on the DFA and take transition c (the longest + possible matched suffix now lies within pat[1...j - 1] + followed by c).

    +
    +
    -For each state j and char `c != pat.charAt(j)`, set `dfa[c][j] = dfa[c][X]`, -then update `X = dfa[pat.charAt(j)][X]`. X is the simulation of `pat[1...j - 1]` on DFA. + +

    Maintain state X as the result of simulating pat[1...j-1] on the DFA, so each step takes only +constant time!

    +
    -> This is the implementation using DFA. -> -{style = "note"} + + +

    Copy dfa[][X] to dfa[][j] for + mismatch case.

    +
    + +

    Set dfa[pat.charAt(j)][j] to j+1 for + match case.

    +
    + +

    Update X.

    +
    +
    -Java (Princeton) +Java ```Java public class KMP { - private final int R; // the radix - private final int m; // length of pattern - private final int[][] dfa; // the KMP automaton - - /** - * Preprocesses the pattern string. - * - * @param pat the pattern string - */ - public KMP(String pat) { - this.R = 256; - this.m = pat.length(); - - // build DFA from pattern - dfa = new int[R][m]; - dfa[pat.charAt(0)][0] = 1; - for (int x = 0, j = 1; j < m; j++) { - for (int c = 0; c < R; c++) - dfa[c][j] = dfa[c][x]; // Copy mismatch cases. - dfa[pat.charAt(j)][j] = j+1; // Set match case. - x = dfa[pat.charAt(j)][x]; // Update restart state. - } - } - - /** - * Preprocesses the pattern string. - * - * @param pattern the pattern string - * @param R the alphabet size - */ - public KMP(char[] pattern, int R) { - this.R = R; - this.m = pattern.length; - - // build DFA from pattern - int m = pattern.length; - dfa = new int[R][m]; - dfa[pattern[0]][0] = 1; - for (int x = 0, j = 1; j < m; j++) { - for (int c = 0; c < R; c++) - dfa[c][j] = dfa[c][x]; // Copy mismatch cases. - dfa[pattern[j]][j] = j+1; // Set match case. - x = dfa[pattern[j]][x]; // Update restart state. - } - } - - /** - * Returns the index of the first occurrence of the pattern string - * in the text string. 
- * - * @param txt the text string - * @return the index of the first occurrence of the pattern string - * in the text string; N if no such match - */ - public int search(String txt) { - - // simulate operation of DFA on text - int n = txt.length(); - int i, j; - for (i = 0, j = 0; i < n && j < m; i++) { - j = dfa[txt.charAt(i)][j]; + private final int[][] dfa; + private final String pattern; + + public KMP(String pattern) { + this.pattern = pattern; + int M = pattern.length(); + int R = 256; + + dfa = new int[R][M]; + dfa[pattern.charAt(0)][0] = 1; + + for (int X = 0, j = 1; j < M; j++) { + for (int c = 0; c < R; c++) { + dfa[c][j] = dfa[c][X]; + } + dfa[pattern.charAt(j)][j] = j + 1; + X = dfa[pattern.charAt(j)][X]; } - if (j == m) return i - m; // found - return n; // not found } - /** - * Returns the index of the first occurrence of the pattern string - * in the text string. - * - * @param text the text string - * @return the index of the first occurrence of the pattern string - * in the text string; N if no such match - */ - public int search(char[] text) { - - // simulate operation of DFA on text - int n = text.length; + public int search(String text) { + int M = pattern.length(); + int N = text.length(); int i, j; - for (i = 0, j = 0; i < n && j < m; i++) { - j = dfa[text[i]][j]; + for (i = 0, j = 0; i < N && j < M; i++) { + j = dfa[text.charAt(i)][j]; } - if (j == m) return i - m; // found - return n; // not found + if (j == M) return i - M; + else return N; } } ``` @@ -1935,44 +1924,42 @@ public class KMP { C++ ```C++ -#include +#include #include +#include class KMP { private: - int R; // the radix - int m; // length of pattern - std::vector> dfa; // the KMP automaton + std::vector> dfa; + std::string pattern; public: - // Preprocesses the pattern string. 
- KMP(std::string pat) { - this->R = 256; - this->m = pat.length(); + explicit KMP(const std::string& pattern) : pattern(pattern) { + const int M = static_cast(pattern.length()); + constexpr int R = 256; - // build DFA from pattern - dfa = std::vector>(R, std::vector(m)); - dfa[pat[0]][0] = 1; - for (int x = 0, j = 1; j < m; j++) { - for (int c = 0; c < R; c++) - dfa[c][j] = dfa[c][x]; // Copy mismatch cases. - dfa[pat[j]][j] = j+1; // Set match case. - x = dfa[pat[j]][x]; // Update restart state. + dfa.resize(R, std::vector(M)); + dfa[pattern[0]][0] = 1; + + for (int X = 0, j = 1; j < M; j++) { + for (int c = 0; c < R; c++) { + dfa[c][j] = dfa[c][X]; + } + dfa[pattern[j]][j] = j + 1; + X = dfa[pattern[j]][X]; } } - // Returns the index of the first occurrence of the pattern string - // in the text string. - int search(std::string txt) { - - // simulate operation of DFA on text - int n = txt.length(); + [[nodiscard]] int search(const std::string& text) const + { + const int M = static_cast(pattern.length()); + const int N = static_cast(text.length()); int i, j; - for (i = 0, j = 0; i < n && j < m; i++) { - j = dfa[txt[i]][j]; + for (i = 0, j = 0; i < N && j < M; i++) { + j = dfa[text[i]][j]; } - if (j == m) return i - m; // found - return n; // not found + if (j == M) return i - M; + else return N; } }; ``` @@ -1981,30 +1968,32 @@ Python ```Python class KMP: - def __init__(self, pat): - self.R = 256 # the radix - self.m = len(pat) # length of pattern - - # build DFA from pattern - self.dfa = [[0 for _ in range(self.m)] for _ in range(self.R)] - self.dfa[ord(pat[0])][0] = 1 - x = 0 - for j in range(1, self.m): - for c in range(self.R): - self.dfa[c][j] = self.dfa[c][x] # Copy mismatch cases. - self.dfa[ord(pat[j])][j] = j + 1 # Set match case. - x = self.dfa[ord(pat[j])][x] # Update restart state. 
- - def search(self, txt): - # simulate operation of DFA on text - n = len(txt) + def __init__(self, pattern): + self.pattern = pattern + M = len(pattern) + R = 256 + + self.dfa = [[0] * M for _ in range(R)] + self.dfa[ord(pattern[0])][0] = 1 + + X = 0 + for j in range(1, M): + for c in range(R): + self.dfa[c][j] = self.dfa[c][X] + self.dfa[ord(pattern[j])][j] = j + 1 + X = self.dfa[ord(pattern[j])][X] + + def search(self, text): + M = len(self.pattern) + N = len(text) i, j = 0, 0 - while i < n and j < self.m: - j = self.dfa[ord(txt[i])][j] + while i < N and j < M: + j = self.dfa[ord(text[i])][j] i += 1 - if j == self.m: - return i - self.m # found - return n # not found + if j == M: + return i - M + else: + return N ``` #### 21.3.3 NFA