String matching algorithm, permutation algorithm

2024-03-22 13:30:46 +08:00 · 2018-12-11 15:28:05 +08:00 · 2018-12-11 15:28:05 +08:00 · 166cd2737b
commit 166cd2737b
parent 3b8fa1782b
19 changed files with 479 additions and 36 deletions
--- a/dataStructure/intervalTree.py
+++ b/dataStructure/intervalTree.py
@ -84,7 +84,7 @@ def genNum(n =10,upper=10):
    return nums.values()

 def buildTree(n=10,nums=None,visitor=None):
-    if nums is None or nums ==[]: nums = genNum(n)
+    #if nums is None or nums ==[]: nums = genNum(n)
    tree = intervalTree()
    print(f'build a red-black tree using {nums}')
    for i in nums:
@ -100,6 +100,7 @@ def testInsert(nums=None):
    print('-'*5+ 'in-order visit' + '-'*5)
    for i,j in enumerate(tree.sort()):
        print(f'{i+1}: {j}')
+    return tree

 def testSuc(nums=None):
    tree,nums = buildTree(nums=nums)
@ -113,10 +114,16 @@ def testDelete(nums=None):
        print(f'deleting {i}')
        tree.delete(i[0])
        print(tree)
+    return tree

 if __name__=='__main__':
    lst = [(0,3),(5,8),(6,10),(26,26),(25,30),(8,9),(19,20),(15,23),(16,21),(17,19)]
-    lst = None
+    #lst = None
    #testSuc(lst)
-    #testInsert(lst)
-    testDelete(lst)
+    tree = testInsert(lst)
+    #tree,_= buildTree(lst)
+    while 1:
+        a =int( input('low:'))
+        b =int( input('high:'))
+        res = tree.search(a,b)
+        print(res)
--- a/dataStructure/redBlackTree.py
+++ b/dataStructure/redBlackTree.py
@ -286,6 +286,7 @@ def buildTree(n=10,nums=None,visitor=None):
    print(f'build a red-black tree using {nums}')
    for i in nums:
        rbtree.insert(node(i))
+        print(rbtree)
        if visitor:
            visitor(rbtree,i)
    return rbtree,nums
--- a/divideAndConquer/min_distance_of_n_points.py
+++ b/divideAndConquer/min_distance_of_n_points.py
@ -107,18 +107,18 @@ def test(f=minDistance_n2):
    print('result: {:.2f} {} {}\n'.format(minD, p,q))

 def genData(n,unique=True):
+    upper = 1000000
    if unique:
        points = set()
        for i in range(n):
-            points.add(point(randint(1,1000),randint(1,1000)))
+            points.add(point(randint(1,upper),randint(1,upper)))
        return list(points)
-    else:return [point(randint(1,1000),randint(1,1000)) for i in range(n)]
+    else:return [point(randint(1,upper),randint(1,upper)) for i in range(n)]

 if __name__ =='__main__':
-    n = 10000
+    n = 1000
    points = genData(n, unique=True)
    print('min distance of {} points'.format(n))
    #print(sorted(points))
    test(minDistance_n2)
    test(minDistance_nlogn)
-    
--- a/dynamicProgramming/lcs.hs
+++ b/dynamicProgramming/lcs.hs
@ -1,8 +0,0 @@
-import Vec2d (Vec2d,getVal,setVal)
-    
-
-lcs a b = 
-    let m = lenghth a
-        n = length b
-        rst = []
-    in 1 --to do
--- a/dynamicProgramming/lcs.py
+++ b/dynamicProgramming/lcs.py
@ -29,17 +29,22 @@ def lcs2(a,b):
    m,n= len(a),len(b)
    board = [[] for i in range(n+1)]
    for i in range(m):
-        last = []
+        upperLevel = board[0].copy()
        for j in range(n):
+            tmp = board[j+1].copy()
            if a[i]==b[j]:
-                board[j+1] =board[j]+[a[i]]
-            elif len(board[j+1]) < len(last):
-                board[j+1] = last
-            last = board[j+1]
+                board[j+1] = upperLevel+[a[i]]
+            elif len(board[j+1]) < len(board[j]):
+                board[j+1] = board[j].copy() # copy is needed
+            upperLevel = tmp
    return board[n]

 if __name__ =='__main__':
-    a="dsaffqewqfqewregqwefqwe"
-    b="adsfsfs3qt5yhyh24efwq"
-    print(lcs(a,b))
-    print(lcs2(a,b))
+    a = 'ABCBDAB'
+    b = 'BDCABA'
+    print('s1:',a)
+    print('s2:',b)
+    while 1:
+        print('lcs:',lcs2(a,b))
+        a = input('s1: ')
+        b = input('s2: ')
--- a/math/permute_back_track.py
+++ b/math/permute_back_track.py
@ -0,0 +1,12 @@
+def permute(n):
+    def _util(lst,i):
+        if i==n:print(lst)
+        else:
+            for j in range(i,n):
+                lst[i],lst[j]=lst[j],lst[i]
+                _util(lst,i+1)
+                lst[i],lst[j]=lst[j],lst[i]
+    _util([i for i in range(n)],0)
+
+if __name__=='__main__':
+    permute(5)
--- a/math/permute_cantor.c
+++ b/math/permute_cantor.c
@ -11,7 +11,7 @@ void calFac(int n)
    }
 }

-void getArrangement(int *arr,int n,int sum)
+void permute(int *arr,int n,int sum)
 {
    /*sum表示全排列由小到大排序后的名次,从0 开始计数, 由名次求出 n位的排列存储到 arr 中*/
    int i,j,ct=0,k, ct2;
@ -36,3 +36,21 @@ void getArrangement(int *arr,int n,int sum)
    }
 }

+void printArr(int *p,int n)
+{
+    for(int i=0;i<n;++i)printf("%d, ",p[i]);
+    printf("\n");
+}
+
+int main()
+{
+    int n = 5,arr[n];
+    calFac(n);
+    for(int i=0;i<5;++i)arr[i]=i; 
+    for(int i=0;i<fac[n];++i){
+        printArr(arr,n);
+        permute(arr,n,i);
+    }
+    return 0;
+}
+        
--- a/math/permute_divide_and_conquer.py
+++ b/math/permute_divide_and_conquer.py
@ -0,0 +1,12 @@
+def permute(lst,n):
+    ''' O(n!), optimal'''
+    if n==1:print(lst)
+    else:
+        for i in range(n):
+            lst[i],lst[n-1] = lst[n-1],lst[i]
+            permute(lst,n-1)
+            lst[i],lst[n-1] = lst[n-1],lst[i]
+
+if __name__=='__main__':
+    n = 3
+    permute([i for i in range(n)],n)
--- a/math/permute_next_arrangement.c
+++ b/math/permute_next_arrangement.c
--- a/math/primesLEn.hs
+++ b/math/primesLEn.hs
@ -0,0 +1,3 @@
+genPrimes 2= [2]
+genPrimes n = let li = genPrimes $n-1
+              in  if all (\x-> mod n x /=0) li then n:li else li
--- a/search/BFS_knight.hs
+++ b/search/BFS_knight.hs
@ -0,0 +1,34 @@
+{- mbinary
+#########################################################################
+# File : BFS_knight.hs
+# Author: mbinary
+# Mail: zhuheqin1@gmail.com
+# Blog: https://mbinary.coding.me
+# Github: https://github.com/mbinary
+# Created Time: 2018-11-11  19:40
+# Description: 
+#########################################################################
+-}
+{-
+Given two different positions on a chess board, find the least number of moves it would take a knight to get from one to the other. The positions will be passed as two arguments in algebraic notation. For example, knight("a3", "b5") should return 1.
+
+The knight is not allowed to move off the board. The board is 8x8.
+-}
+
+module ShortestKnightPath.Kata (knight) where
+import Data.Char
+import Data.List
+knight :: String -> String -> Int
+knight s1 s2  = let begin = axis s1
+                    end =  axis s2
+                    notEnd = all (\tp->tp /=end) 
+                in length . takeWhile notEnd .iterate gen $[begin]
+
+gen li = nub. flatten $map (filter  (\(a,b) ->a>0 && b>0 &&a<9&&b<9 ) . change)  li
+change (a,b) = [(a-1,b-2),(a-1,b+2),(a+1,b-2),(a+1,b+2),(a+2,b-1),(a+2,b+1),(a-2,b+1),(a-2,b-1)]
+
+axis s = (ord (s!!0) -96, digitToInt (s!!1)::Int)
+
+flatten [] = []
+flatten (x:xs) = x ++ flatten xs
+
--- a/search/binary_search.hs
+++ b/search/binary_search.hs
@ -0,0 +1,7 @@
+search  i li= binary 0  $length li -1
+             where binary a b= let mid = div (a+b) 2
+                                   p = li!!mid 
+                               in  if a>=b then a
+                                   else if p==i then mid
+                                   else if p>i then binary a $mid-1
+                                   else   binary (mid+1) b
--- a/search/schedule.py
+++ b/search/schedule.py
@ -0,0 +1,104 @@
+'''
+回溯全空间搜索, 剪枝优化
+
+
+设有n个任务由k个可并行工作的机器来完成，完成任务i需要时间为 。试设计一个算法找出完成这n个任务的最佳调度，使完成全部任务的时间最早。
+'''
+from time import time
+from functools import total_ordering
+@total_ordering
+class record:
+    def __init__(self,nums=None):
+        if nums is None:
+            nums=[]
+        self.nums=nums
+        self.sum = sum(nums)
+    def append(self,x):
+        self.nums.append(x)
+        self.sum+=x
+    def pop(self):
+        x = self.nums.pop()
+        self.sum-=x
+        return x
+    def __repr__(self):
+        return repr(self.nums)
+    def __lt__(self,r):
+        return self.sum<r.sum
+    def __eq__(self,r):
+        return self.sum==r.sum
+    def tolist(self):
+        return self.nums.copy()
+    def __hash__(self):
+        return self.sum
+def schedule(works,k):
+    def backtrackSearch(i,lsts):
+        nonlocal best,rst
+        if i==n:
+            cost = max(r.sum for r in lsts )
+            if best>cost:
+                best= cost
+                rst = [st.tolist() for st in lsts]
+        else:
+            for cur in set(lsts):
+                if best>cur.sum+works[i]:
+                    cur.append(works[i])
+                    backtrackSearch(i+1,lsts)
+                    cur.pop()
+    def findInitial(i,lst):
+        nonlocal best
+        if i==n:
+            cost = max(lst)
+            if best>cost:best = cost
+        else:
+            mn = lst[0]
+            idx = 0
+            visited=set()
+            for j,cur in enumerate(lst):
+                if cur not in visited:
+                    visited.add(cur)
+                    if mn>cur:
+                        mn = cur
+                        idx = j
+            lst[idx]+=works[i]
+            findInitial(i+1,lst)
+            lst[idx]-=works[i]
+
+
+    n = len(works)
+    print()
+    print('machine Num:',n)
+    print('works      :',works)
+    rst =  None
+    works.sort(reverse=True) # key step
+    best = sum(works[:n-k+1])
+    t = time()
+    findInitial(0,[0]*k) # key step
+    t1 = time()-t
+    print('init  solution: {}    cost time {:.6f}s'.format(best,t1))
+    t = time()
+    backtrackSearch(0,[record() for i in range(k)])
+    t2 = time()-t
+    print('final solution: {}    cost time {:.6f}s'.format(best,t2))
+    print('schedule  plan:',rst)
+    return best,rst
+
+if __name__=='__main__':
+    from random import randint
+    schedule([47,20,28,44,21,45,30,39,28,33],3)
+    schedule([98,84,50,23,32,99,22,76,72,61,81,39,76,54,37],5)
+    schedule([39,39,23,45,100,69,21,81,39,55,20,86,34,53,58,99,36,45,46],8)
+
+'''
+machine Num: 19
+works       : [39, 39, 23, 45, 100, 69, 21, 81, 39, 55, 20, 86, 34, 53, 58, 99, 36, 45, 46]
+
+works  经过逆序排序
+init  solution: 135    cost time 0.000196s
+final solution: 126    cost time 0.022922s
+schedule  plan: [[100, 21], [99, 23], [86, 39], [81, 45], [69, 53], [58, 45, 20], [55, 36, 34], [46, 39, 39]]
+
+works 没有经过排序
+init  solution: 168    cost time 0.000179s
+final solution: 126    cost time 10.646307s
+schedule  plan: [[39, 86], [39, 34, 53], [23, 99], [45, 39, 36], [100, 20], [69, 55], [21, 58, 46], [81, 45]]
+'''
--- a/string/KMP.py
+++ b/string/KMP.py
@ -0,0 +1,58 @@
+#coding: utf-8
+''' mbinary
+#########################################################################
+# File : KMP.py
+# Author: mbinary
+# Mail: zhuheqin1@gmail.com
+# Blog: https://mbinary.coding.me
+# Github: https://github.com/mbinary
+# Created Time: 2018-12-11  14:02
+# Description:
+#########################################################################
+'''
+
+def getPrefixFunc(s):
+    '''return the list of prefix function of s'''
+    length = 0
+    i = 1
+    n = len(s)
+    ret = [0]
+    while i<n:
+        if s[i]==s[length]:
+            length +=1
+            ret.append(length)
+            i+=1
+        else:
+            if length==0:
+                ret.append(0)
+                i+=1
+            else:
+                length = ret[length-1]
+    return ret
+
+def findAll(s,p):
+    pre = getPrefixFunc(p)
+    i = j  =0
+    n,m = len(s),len(p)
+    ret = []
+    while i<n:
+        if s[i]==p[j]:
+            i+=1
+            j+=1
+            if j==m:
+                ret.append(i-j)
+                j=pre[j-1]
+        else:
+            if j==0: i+=1
+            else: j = pre[j-1]
+    return ret
+def randStr(n=3):
+    return [randint(ord('a'),ord('z')) for i in range(n)]
+
+if __name__ =='__main__':
+    from random import randint
+    s = randStr(50)
+    p = randStr(1)
+    print(s)
+    print(p)
+    print(findAll(s,p))
--- a/string/README.md
+++ b/string/README.md
@ -0,0 +1,110 @@
+# String Matching algorithm
+
+![](https://upload-images.jianshu.io/upload_images/7130568-e10dc137e9083a0e.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
+
+## Rabin-Karp
+We can view a string of k characters (digits) as a length-k decimal number.  E.g., the string “31425” corresponds to the decimal number 31,425.
+- Given a pattern P [1..m], let p denote the corresponding decimal value.
+- Given a text T [1..n], let $t_s$ denote the decimal value of the length-m substring  T [(s+1)..(s+m)] for s=0,1,…,(n-m).
+- Let `d` be the radix of these numbers, i.e. the number of distinct characters in the text: $d = len(set(T))$.
+- $t_s$ = p iff T [(s+1)..(s+m)] = P [1..m].
+- p can be computed in O(m) time. p = P[m] + d\*(P[m-1] + d\*(P[m-2]+…)).
+- $t_0$ can similarly be computed in O(m) time.
+- The other values $t_1,\ldots,t_{n-m}$ can be computed in O(n-m) total time, since $t_{s+1}$ can be computed from $t_s$ in constant time.
+Namely,
+
+$$ 
+t_{s+1} = d*(t_s-d^{m-1} * T[s+1])+T[s+m+1]
+$$
+However, there is no need to compute $t_{s+1}$ exactly. We can use the modulus operation to reduce the work of calculation.
+
+We choose a small prime number q, e.g. q = 13 for radix d = 10.
+Generally, d\*q should fit within one computer word.
+
+First we calculate $t_0 \bmod q$.
+Then, for every $t_i\ (i \ge 1)$,
+assume
+$$
+ t_{i-1} = T[i+m-1] + d*T[i+m-2]+\ldots+d^{m-1}*T[i]
+$$
+denote $d' = d^{m-1} \bmod q$,
+thus,
+$$
+\begin{aligned}
+t_i &= (t_{i-1} - d^{m-1}*T[i]) * d + T[i+m]\\
+&\equiv (t_{i-1} - d^{m-1}*T[i]) * d + T[i+m] \pmod q\\
+&\equiv (t_{i-1}- ( d^{m-1} \bmod q) *T[i]) * d + T[i+m] \pmod q\\
+&\equiv (t_{i-1}- d'*T[i]) * d + T[i+m] \pmod q
+\end{aligned}
+$$
+
+So we can compare the modular value of each $t_i$ with that of p.
+Only if they are the same do we compare the original characters, namely $T[i+1],T[i+2],\ldots,T[i+m]$, with the pattern.
+Generally, this algorithm's expected running time is O(n+m), and the worst case is O((n-m+1)\*m).
+
+**Problem: this assumes p and $t_s$ are small numbers. They may be too large to work with easily.**
+
+## FSM
+A FSM can be represented as (Q,q0,A,S,C), where
+- Q is the set of all states
+- q0 is the start state
+- $A\subseteq Q$ is a set of accepting states.
+- S is a finite input alphabet.
+- C is the set of transition functions: namely  $q_j = c(s,q_i)$.
+
+Given a pattern string P, we can build a FSM for string matching.
+Assume P has m chars; then the FSM has m+1 states: one is the start state, and the others represent having matched P up to each of its m positions.
+
+Once we have built the FSM, we can run it on any input string.
+## KMP
+>Knuth-Morris-Pratt method
+
+The idea is inspired by FSM. We can avoid computing the transition functions. Instead, we compute a prefix function `Next` on P in O(m) time, and `Next` has only m entries.
+> The prefix function stores info about how the pattern matches against shifts of itself.
+
+- String w is a prefix of string x, if x=wy for some string y
+- String w is a suffix of string x, if x=yw for some string y
+- The k-character prefix of the pattern P [1..m] is denoted by $P_k$.
+- Given that pattern prefix P [1..q] matches text characters T [(s+1)..(s+q)], what is the least shift s'> s such that P [1..k] = T [(s'+1)..(s'+k)] where s'+k=s+q?
+- At the new shift s', no need to compare the first k characters of P with corresponding characters of T.
+Method: For each prefix $P_q$, find the longest proper prefix of $P_q$ that is also a suffix of $P_q$.
+Next[q] = max{k | k\<q and $P_k$ is a suffix of $P_q$}
+
+For example:  P = ababaca,  for $P_5$ = ababa, Next[5] = 3. Namely $P_3$ = aba is the longest proper prefix of $P_5$ that is also a suffix of $P_5$.
+
+Time complexity: computing the prefix function `Next` takes O(m), and matching takes O(m+n) in total.
+
+## Boyer-Moore
+- The longer the pattern is, the faster it works.
+- Starts from the end of pattern, while KMP starts from the beginning.
+- Works best for character string, while KMP works best for binary string.
+- KMP and Boyer-Moore
+  - Preprocessing existing patterns.
+  - Searching patterns in input strings.
+## Sunday
+### features
+- simplification of the Boyer-Moore algorithm;
+- uses only the bad-character shift;
+- easy to implement;
+- preprocessing phase in O(m+sigma) time and O(sigma) space complexity;
+- searching phase in O(mn) time complexity;
+- very fast in practice for short patterns and large alphabets.
+### description
+The Quick Search algorithm uses only the bad-character shift table (see chapter Boyer-Moore algorithm). After an attempt where the window is positioned on the text factor y[j .. j+m-1], the length of the shift is at least equal to one. So, the character y[j+m] is necessarily involved in the next attempt, and thus can be used for the bad-character shift of the current attempt.
+
+The bad-character shift of the present algorithm is slightly modified to take into account the last character of x as follows: for c in Sigma, qsBc[c]=min{i : 0 < i ≤ m and x[m-i]=c} if c occurs in x, m+1 otherwise (thanks to Darko Brljak).
+
+The preprocessing phase is in O(m+sigma) time and O(sigma) space complexity.
+
+During the searching phase the comparisons between pattern and text characters during each attempt can be done in any order. The searching phase has a quadratic worst case time complexity but it has a good practical behaviour.
+
+For instance,
+![image.png](https://upload-images.jianshu.io/upload_images/7130568-76d130ae24603d51.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
+
+In this example, t0, ..., t4 =  a b c a b is the current text window that is compared with the pattern. Its suffix a b has matched, but the comparison c-a causes a mismatch. The bad-character heuristics of the Boyer-Moore algorithm (a) uses the "bad" text character c to determine the shift distance. The Horspool algorithm (b) uses the rightmost character b of the current text window. The Sunday algorithm (c) uses the character directly right of the text window, namely d in this example. Since d does not occur in the pattern at all, the pattern can be shifted past this position.
+
+
+# Reference:
+1. Xuyun, ppt, String matching
+2. [Sunday-algorithm](http://www.inf.fh-flensburg.de/lang/algorithmen/pattern/sunday.htm)
+3. GeeksforGeeks, [KMP Algorithm](https://www.geeksforgeeks.org/kmp-algorithm-for-pattern-searching/)
--- a/string/rabin_karp.py
+++ b/string/rabin_karp.py
@ -0,0 +1,60 @@
+#coding: utf-8
+''' mbinary
+#########################################################################
+# File : rabin_karp.py
+# Author: mbinary
+# Mail: zhuheqin1@gmail.com
+# Blog: https://mbinary.coding.me
+# Github: https://github.com/mbinary
+# Created Time: 2018-12-11  00:01
+# Description: rabin-karp algorithm
+#########################################################################
+'''
+
+def isPrime(x):
+    for i in range(2,int(x**0.5)+1):
+        if x%i==0:return False
+    return True
+def getPrime(x):
+    '''return a prime which is bigger than x'''
+    for i in range(x,2*x):
+        if isPrime(i):return i
+def findAll(s,p):
+    '''s: string   p: pattern'''
+    dic={}
+    n,m = len(s),len(p)
+    d=0 #radix
+    for c in s:
+        if c not in dic:
+            dic[c]=d
+            d+=1
+    sm = 0
+    for c in p:
+        if c not in dic:return [-1]
+        sm = sm*d+dic[c]
+
+    ret = []
+    cur = 0
+    for i in range(m): cur=cur*d + dic[s[i]]
+    if cur==sm:ret.append(0)
+    tmp = n-m
+    q = getPrime(m)
+    cur = cur%q
+    sm = sm%q
+    exp = d**(m-1) % q
+    for i in range(m,n):
+        cur = ((cur-dic[s[i-m]]*exp)*d+dic[s[i]]) % q
+        if cur == sm and p==s[i-m+1:i+1]:
+            ret.append(i-m+1)
+    return ret
+
+def randStr(n=3):
+    return [randint(ord('a'),ord('z')) for i in range(n)]
+
+if __name__ =='__main__':
+    from random import randint
+    s = randStr(50)
+    p = randStr(1)
+    print(s)
+    print(p)
+    print(findAll(s,p))
--- a/string/src/compare.jpg
+++ b/string/src/compare.jpg
--- a/string/src/general.jpg
+++ b/string/src/general.jpg
--- a/string/sunday.py
+++ b/string/sunday.py
@ -28,30 +28,50 @@ def find(s,p):
        if s[ps] == p[pp]:
            ps,pp = ps+1,pp+1
        else:
-            idx = ps-pp+np
+            idx = ps+ np-pp
            if idx >=ns:return -1
            ch = s[idx]
            if ch in dic:
                ps += dic[ch]+1-pp
            else:
-                ps += np-pp
+                ps = idx+1
            pp = 0
    if pp==np:return ps-np
-    else: 
+    else:
        return -1
-def test():
-    s = [randint(78,88) for i in range(30)]
-    p = [randint(78,88) for i in range(3)]
+def findAll(s,p):
+    ns = len(s)
+    np = len(p)
+    i = 0
+    ret = []
+    while s:
+        print(s,p)
+        tmp = find(s,p)
+        if tmp==-1: break
+        ret.append(i+tmp)
+        end = tmp+np
+        i +=end
+        s = s[end:]
+    return ret
+
+
+
+def randStr(n=3):
+    return [randint(ord('a'),ord('z')) for i in range(n)]
+
+def test(n):
+    s = randStr(n)
+    p = randStr(3)
    str_s = ''.join((chr(i) for i in s))
    str_p = ''.join((chr(i) for i in p))
    n1 = find(s,p)
-    n2 = str_s.find(str_p)
+    n2 = str_s.find(str_p) # 利用已有的 str find 算法检验
    if n1!=n2:
        print(n1,n2,str_p,str_s)
        return False
    return True
 if __name__ =='__main__':
    from random import randint
-    n = 10000
-    suc = sum(test() for i in range(n))
-    print(f'test {n} times, success {suc} times')
+    n = 1000
+    suc = sum(test(n) for i in range(n))
+    print('test {n} times, success {suc} times'.format(n=n,suc=suc))