String matching

Searching for a pattern is a fundamental problem when dealing with text

Example

Formally

Brute force approach

Nested scan from left to right in t

def stringmatch(t,p):
    poslist = []
    for i in range(len(t)-len(p)+1):
        matched = True
        j = 0
        while j < len(p) and matched:
            if t[i+j] != p[j]:
                matched = False
            j = j+1
        if matched:
            poslist.append(i)
    return(poslist)
print(stringmatch('abababbababbbbababab','abab'))

Output

[0, 2, 7, 14, 16]

Complexity

 

Nested scan from right to left

def stringmatchrev(t,p):
    poslist = []
    for i in range(len(t)-len(p)+1):
        matched = True
        j = len(p)-1
        while j >= 0 and matched:
            if t[i+j] != p[j]:
                matched = False
            j = j-1
        if matched:
            poslist.append(i)
    return(poslist)
print(stringmatchrev('abababbababbbbababab','abab'))

Output

[0, 2, 7, 14, 16]

Complexity

 

Speeding up the brute force algorithm

 

Boyer-Moore Algorithm

Algorithm

Implementation

def boyermoore(t,p):
    last = {} # Preprocess
    for i in range(len(p)):
        last[p[i]] = i
    poslist=[]
    i = 0
    while i <= (len(t)-len(p)):
        matched,j = True,len(p)-1
        while j >= 0 and matched:
            if t[i+j] != p[j]:
                matched = False
            j = j - 1
        if matched:
            poslist.append(i)
            i = i + 1
        else:
            j = j + 1
            if t[i+j] in last.keys():
                i = i + max(j-last[t[i+j]],1)
            else:
                i = i + j + 1
    return(poslist)
print(boyermoore('abcaaacabc','abc'))

Output

[0, 7]

Complexity

Worst case remains

If t = aaa...a, p = baaa

 

Rabin-Karp Algorithm

Implementation

def rabinkarp(t,p):
    poslist = []
    numt,nump = 0,0
    for i in range(len(p)):
        numt = 10*numt + int(t[i])
        nump = 10*nump + int(p[i])
    if numt == nump:
        poslist.append(0)
    for i in range(1,len(t)-len(p)+1):
        numt = numt - int(t[i-1])*(10**(len(p)-1))
        numt = 10*numt + int(t[i+len(p)-1])
        if numt == nump:
            poslist.append(i)
    return(poslist)
print(rabinkarp('233323233454323','23'))

Output

[0, 4, 6, 13]

Analysis

 

Rabin Karp Implementation for strings

def rabin_karp(text, pattern):
    match_found =[]
    n = len(text)
    m = len(pattern)    
    # Prime number to use for the hash function
    prime = 101   
    # Calculate the hash value of the pattern
    pattern_hash = 0
    for i in range(m):
        pattern_hash += ord(pattern[i])
    pattern_hash = pattern_hash % prime
    
    # Calculate the hash value of the first substring of the text
    text_hash = 0
    for i in range(m):
        text_hash += ord(text[i])
    text_hash = text_hash % prime
    # Iterate through the text, checking for matches with the pattern
    for i in range(n - m + 1):
        # Check if the current substring matches the pattern
        if text_hash == pattern_hash and text[i:i+m] == pattern:
            match_found.append(i)       
        # Calculate the hash value of the next substring
        if i < n - m:
            text_hash = (text_hash - ord(text[i]) + ord(text[i+m]))
            text_hash = text_hash % prime
    # No match found
    return match_found
text = 'abcdbabcdb'
pattern = 'abcdb'
print(rabin_karp(text, pattern))

Output

[0, 5]

 

 

Knuth-Morris-Pratt algorithm

Computing the fail function

Implementation of fail function

def kmp_fail(p):
    m = len(p)
    fail = [0 for i in range(m)]
    j,k = 1,0
    while j < m:
        if p[j] == p[k]:
            fail[j] = k+1
            j,k = j+1,k+1
        elif k > 0:
            k = fail[k-1]
        else:
            j = j+1
    return(fail)
print(kmp_fail('abcaabca'))

Output

[0, 0, 0, 1, 1, 2, 3, 4]

Complexity

 

Implementation of KMP algorithm

def kmp_fail(p):
    m = len(p)
    fail = [0 for i in range(m)]
    j,k = 1,0
    while j < m:
        if p[j] == p[k]:
            fail[j] = k+1
            j,k = j+1,k+1
        elif k > 0:
            k = fail[k-1]
        else:
            j = j+1
    return(fail)

def find_kmp(t, p):
    match =[]
    n,m = len(t),len(p)
    if m == 0:
        match.append(0)
    fail = kmp_fail(p)
    j = 0
    k = 0
    while j < n:
        if t[j] == p[k]:
            if k == m - 1:
                match.append(j - m + 1)
                k = 0
                j = j - m + 2
            else:
                j,k = j+1,k+1
        elif k > 0:
            k = fail[k-1]
        else:
            j = j+1
    return(match)
print(find_kmp('ababaabbaba','aba'))

Output

[0, 2, 8]

Analysis

 

Tries

Implementation of Tries

class Trie:
    def __init__(self,S=[]):
        self.root = {}
        for s in S:
            self.add(s)
    def add(self,s):
        curr = self.root
        s = s + "$"
        for c in s:
            if c not in curr.keys():
                curr[c] = {}
            curr = curr[c]
    def query(self,s):
        curr = self.root
        for c in s:
            if c not in curr.keys():
                return(False)
            curr = curr[c]
        if "$" in curr.keys():
            return(True)
        else:
            return(False)
        
T = Trie()
T.add('car')
T.add('card')
T.add('care')
T.add('dog')
T.add('done')
print(T.query('dog'))
print(T.query('cat'))

Output

True
False

Analysis

 

Suffix Tries

Implementation of suffix tries

class SuffixTrie:
    def __init__(self,s):
        self.root = {}
        s = s + "$"
        for i in range(len(s)):
            curr = self.root
            for c in s[i:]:
                if c not in curr.keys():
                    curr[c] = {}
                curr = curr[c]
    def followPath(self,s):
        curr = self.root
        for c in s:
            if c not in curr.keys():
                return(None)
            curr = curr[c]
        return(curr)
    def hasSuffix(self,s):
        node = self.followPath(s)
        return(node is not None and "$" in node.keys())
ST = SuffixTrie('card')
print(ST.root)
print(ST.followPath('a'))
print(ST.hasSuffix('aa'))

Output

{'r': {'d': {'$': {}}}}
False

 

Topics