mirror of
https://github.com/heqin-zhu/algorithm.git
synced 2024-03-22 13:30:46 +08:00
String matching algorithm, permutation algorithm
This commit is contained in:
parent
3b8fa1782b
commit
166cd2737b
|
@ -84,7 +84,7 @@ def genNum(n =10,upper=10):
|
||||||
return nums.values()
|
return nums.values()
|
||||||
|
|
||||||
def buildTree(n=10,nums=None,visitor=None):
|
def buildTree(n=10,nums=None,visitor=None):
|
||||||
if nums is None or nums ==[]: nums = genNum(n)
|
#if nums is None or nums ==[]: nums = genNum(n)
|
||||||
tree = intervalTree()
|
tree = intervalTree()
|
||||||
print(f'build a red-black tree using {nums}')
|
print(f'build a red-black tree using {nums}')
|
||||||
for i in nums:
|
for i in nums:
|
||||||
|
@ -100,6 +100,7 @@ def testInsert(nums=None):
|
||||||
print('-'*5+ 'in-order visit' + '-'*5)
|
print('-'*5+ 'in-order visit' + '-'*5)
|
||||||
for i,j in enumerate(tree.sort()):
|
for i,j in enumerate(tree.sort()):
|
||||||
print(f'{i+1}: {j}')
|
print(f'{i+1}: {j}')
|
||||||
|
return tree
|
||||||
|
|
||||||
def testSuc(nums=None):
|
def testSuc(nums=None):
|
||||||
tree,nums = buildTree(nums=nums)
|
tree,nums = buildTree(nums=nums)
|
||||||
|
@ -113,10 +114,16 @@ def testDelete(nums=None):
|
||||||
print(f'deleting {i}')
|
print(f'deleting {i}')
|
||||||
tree.delete(i[0])
|
tree.delete(i[0])
|
||||||
print(tree)
|
print(tree)
|
||||||
|
return tree
|
||||||
|
|
||||||
if __name__=='__main__':
|
if __name__=='__main__':
|
||||||
lst = [(0,3),(5,8),(6,10),(26,26),(25,30),(8,9),(19,20),(15,23),(16,21),(17,19)]
|
lst = [(0,3),(5,8),(6,10),(26,26),(25,30),(8,9),(19,20),(15,23),(16,21),(17,19)]
|
||||||
lst = None
|
#lst = None
|
||||||
#testSuc(lst)
|
#testSuc(lst)
|
||||||
#testInsert(lst)
|
tree = testInsert(lst)
|
||||||
testDelete(lst)
|
#tree,_= buildTree(lst)
|
||||||
|
while 1:
|
||||||
|
a =int( input('low:'))
|
||||||
|
b =int( input('high:'))
|
||||||
|
res = tree.search(a,b)
|
||||||
|
print(res)
|
||||||
|
|
|
@ -286,6 +286,7 @@ def buildTree(n=10,nums=None,visitor=None):
|
||||||
print(f'build a red-black tree using {nums}')
|
print(f'build a red-black tree using {nums}')
|
||||||
for i in nums:
|
for i in nums:
|
||||||
rbtree.insert(node(i))
|
rbtree.insert(node(i))
|
||||||
|
print(rbtree)
|
||||||
if visitor:
|
if visitor:
|
||||||
visitor(rbtree,i)
|
visitor(rbtree,i)
|
||||||
return rbtree,nums
|
return rbtree,nums
|
||||||
|
|
|
@ -107,18 +107,18 @@ def test(f=minDistance_n2):
|
||||||
print('result: {:.2f} {} {}\n'.format(minD, p,q))
|
print('result: {:.2f} {} {}\n'.format(minD, p,q))
|
||||||
|
|
||||||
def genData(n,unique=True):
|
def genData(n,unique=True):
|
||||||
|
upper = 1000000
|
||||||
if unique:
|
if unique:
|
||||||
points = set()
|
points = set()
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
points.add(point(randint(1,1000),randint(1,1000)))
|
points.add(point(randint(1,upper),randint(1,upper)))
|
||||||
return list(points)
|
return list(points)
|
||||||
else:return [point(randint(1,1000),randint(1,1000)) for i in range(n)]
|
else:return [point(randint(1,upper),randint(1,upper)) for i in range(n)]
|
||||||
|
|
||||||
if __name__ =='__main__':
|
if __name__ =='__main__':
|
||||||
n = 10000
|
n = 1000
|
||||||
points = genData(n, unique=True)
|
points = genData(n, unique=True)
|
||||||
print('min distance of {} points'.format(n))
|
print('min distance of {} points'.format(n))
|
||||||
#print(sorted(points))
|
#print(sorted(points))
|
||||||
test(minDistance_n2)
|
test(minDistance_n2)
|
||||||
test(minDistance_nlogn)
|
test(minDistance_nlogn)
|
||||||
|
|
||||||
|
|
|
@ -1,8 +0,0 @@
|
||||||
import Vec2d (Vec2d,getVal,setVal)
|
|
||||||
|
|
||||||
|
|
||||||
lcs a b =
|
|
||||||
let m = lenghth a
|
|
||||||
n = length b
|
|
||||||
rst = []
|
|
||||||
in 1 --to do
|
|
|
@ -29,17 +29,22 @@ def lcs2(a,b):
|
||||||
m,n= len(a),len(b)
|
m,n= len(a),len(b)
|
||||||
board = [[] for i in range(n+1)]
|
board = [[] for i in range(n+1)]
|
||||||
for i in range(m):
|
for i in range(m):
|
||||||
last = []
|
upperLevel = board[0].copy()
|
||||||
for j in range(n):
|
for j in range(n):
|
||||||
|
tmp = board[j+1].copy()
|
||||||
if a[i]==b[j]:
|
if a[i]==b[j]:
|
||||||
board[j+1] =board[j]+[a[i]]
|
board[j+1] = upperLevel+[a[i]]
|
||||||
elif len(board[j+1]) < len(last):
|
elif len(board[j+1]) < len(board[j]):
|
||||||
board[j+1] = last
|
board[j+1] = board[j].copy() # copy is needed
|
||||||
last = board[j+1]
|
upperLevel = tmp
|
||||||
return board[n]
|
return board[n]
|
||||||
|
|
||||||
if __name__ =='__main__':
|
if __name__ =='__main__':
|
||||||
a="dsaffqewqfqewregqwefqwe"
|
a = 'ABCBDAB'
|
||||||
b="adsfsfs3qt5yhyh24efwq"
|
b = 'BDCABA'
|
||||||
print(lcs(a,b))
|
print('s1:',a)
|
||||||
print(lcs2(a,b))
|
print('s2:',b)
|
||||||
|
while 1:
|
||||||
|
print('lcs:',lcs2(a,b))
|
||||||
|
a = input('s1: ')
|
||||||
|
b = input('s2: ')
|
||||||
|
|
12
math/permute_back_track.py
Normal file
12
math/permute_back_track.py
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
def permute(n):
    '''Print every permutation of 0..n-1, one list per line.

    Works by swap-based backtracking on a single shared list: slot `pos`
    is filled with each remaining candidate in turn, the rest is permuted
    recursively, and the swap is undone afterwards.
    '''
    items = list(range(n))

    def _place(pos):
        # all slots fixed: a complete permutation is ready
        if pos==n:
            print(items)
            return
        for pick in range(pos,n):
            items[pos],items[pick] = items[pick],items[pos]
            _place(pos+1)
            # undo the swap so the next candidate starts from the same state
            items[pos],items[pick] = items[pick],items[pos]

    _place(0)
|
||||||
|
|
||||||
|
if __name__=='__main__':
|
||||||
|
permute(5)
|
|
@ -11,7 +11,7 @@ void calFac(int n)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void getArrangement(int *arr,int n,int sum)
|
void permute(int *arr,int n,int sum)
|
||||||
{
|
{
|
||||||
/*sum表示全排列由小到大排序后的名次,从0 开始计数, 由名次求出 n位的排列存储到 arr 中*/
|
/*sum表示全排列由小到大排序后的名次,从0 开始计数, 由名次求出 n位的排列存储到 arr 中*/
|
||||||
int i,j,ct=0,k, ct2;
|
int i,j,ct=0,k, ct2;
|
||||||
|
@ -36,3 +36,21 @@ void getArrangement(int *arr,int n,int sum)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Print the first n ints of p on one line, each followed by ", ",
 * then terminate the line. */
void printArr(int *p,int n)
{
    int i = 0;
    while(i < n){
        printf("%d, ", p[i]);
        ++i;
    }
    printf("\n");
}
|
||||||
|
|
||||||
|
int main()
|
||||||
|
{
|
||||||
|
int n = 5,arr[n];
|
||||||
|
calFac(n);
|
||||||
|
for(int i=0;i<5;++i)arr[i]=i;
|
||||||
|
for(int i=0;i<fac[n];++i){
|
||||||
|
printArr(arr,n);
|
||||||
|
permute(arr,n,i);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
12
math/permute_divide_and_conquer.py
Normal file
12
math/permute_divide_and_conquer.py
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
def permute(lst,n):
    '''Print every ordering of the first n items of lst (prints nothing for n < 1).

    Each of the first n items is swapped into slot n-1 in turn and the
    remaining n-1 slots are permuted recursively. Every swap is undone,
    so lst is back in its original order when the call returns.
    '''
    if n==1:
        print(lst)
        return
    last = n-1
    for i in range(n):
        lst[i],lst[last] = lst[last],lst[i]
        permute(lst,last)
        # swap back before trying the next candidate for the last slot
        lst[last],lst[i] = lst[i],lst[last]
|
||||||
|
|
||||||
|
if __name__=='__main__':
|
||||||
|
n = 3
|
||||||
|
permute([i for i in range(n)],n)
|
3
math/primesLEn.hs
Normal file
3
math/primesLEn.hs
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
-- | All primes <= n, largest first (e.g. genPrimes 10 == [7,5,3,2]).
-- n is prime exactly when no smaller prime divides it.
-- The n < 2 guard gives an empty list instead of recursing forever.
genPrimes n
  | n < 2                               = []
  | all (\p -> n `mod` p /= 0) smaller  = n : smaller
  | otherwise                           = smaller
  where smaller = genPrimes (n - 1)
|
34
search/BFS_knight.hs
Normal file
34
search/BFS_knight.hs
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
{- mbinary
|
||||||
|
#########################################################################
|
||||||
|
# File : BFS_knight.hs
|
||||||
|
# Author: mbinary
|
||||||
|
# Mail: zhuheqin1@gmail.com
|
||||||
|
# Blog: https://mbinary.coding.me
|
||||||
|
# Github: https://github.com/mbinary
|
||||||
|
# Created Time: 2018-11-11 19:40
|
||||||
|
# Description:
|
||||||
|
#########################################################################
|
||||||
|
-}
|
||||||
|
{-
|
||||||
|
Given two different positions on a chess board, find the least number of moves it would take a knight to get from one to the other. The positions will be passed as two arguments in algebraic notation. For example, knight("a3", "b5") should return 1.
|
||||||
|
|
||||||
|
The knight is not allowed to move off the board. The board is 8x8.
|
||||||
|
-}
|
||||||
|
|
||||||
|
module ShortestKnightPath.Kata (knight) where
|
||||||
|
import Data.Char
|
||||||
|
import Data.List
|
||||||
|
-- | Least number of knight moves between two squares given in algebraic
-- notation (e.g. "a3"). Frontiers are expanded breadth-first; the answer
-- is the number of frontiers generated before one contains the target.
knight :: String -> String -> Int
knight s1 s2 = length (takeWhile (notElem target) frontiers)
  where
    target    = axis s2
    frontiers = iterate gen [axis s1]

-- | One BFS step: every on-board square reachable by a single knight move
-- from any square of the current frontier, without duplicates.
gen li = nub (flatten (map (filter onBoard . change) li))
  where
    onBoard (a, b) = a > 0 && b > 0 && a < 9 && b < 9

-- | The eight knight moves from (a,b), not yet clipped to the board.
change (a,b) =
  [ (a-1,b-2), (a-1,b+2), (a+1,b-2), (a+1,b+2)
  , (a+2,b-1), (a+2,b+1), (a-2,b+1), (a-2,b-1) ]

-- | "a1" -> (1,1): column letter to 1-based file, digit to rank.
axis s = (ord (s !! 0) - 96, digitToInt (s !! 1) :: Int)

-- | Concatenate a list of lists.
flatten xss = foldr (++) [] xss
|
||||||
|
|
7
search/binary_search.hs
Normal file
7
search/binary_search.hs
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
-- | Bisect the list li for the value i and return an index.
-- Presumably li is sorted ascending -- TODO confirm against callers.
-- When a midpoint equals i its index is returned; once the range
-- collapses (lo >= hi) lo is returned WITHOUT checking that li !! lo == i,
-- so a result does not by itself prove that i is present.
search i li = go 0 (length li - 1)
  where
    go lo hi
      | lo >= hi   = lo
      | pivot == i = mid
      | pivot > i  = go lo (mid - 1)
      | otherwise  = go (mid + 1) hi
      where
        mid   = (lo + hi) `div` 2
        pivot = li !! mid   -- lazy: only evaluated when lo < hi
|
104
search/schedule.py
Normal file
104
search/schedule.py
Normal file
|
@ -0,0 +1,104 @@
|
||||||
|
'''
|
||||||
|
回溯全空间搜索, 剪枝优化
|
||||||
|
|
||||||
|
|
||||||
|
设有n个任务由k个可并行工作的机器来完成,完成任务i需要时间为 。试设计一个算法找出完成这n个任务的最佳调度,使完成全部任务的时间最早。
|
||||||
|
'''
|
||||||
|
from time import time
|
||||||
|
from functools import total_ordering
|
||||||
|
@total_ordering
class record:
    '''Job list of one machine together with its running total load.

    Records compare, test equal and hash by total load only, so set() keeps
    a single representative for machines that carry the same load.
    '''
    def __init__(self,nums=None):
        # copy, so the record never aliases (and later mutates) the caller's list
        self.nums = [] if nums is None else list(nums)
        self.sum = sum(self.nums)
    def append(self,x):
        '''Assign job x to this machine.'''
        self.nums.append(x)
        self.sum+=x
    def pop(self):
        '''Remove and return the most recently assigned job.'''
        x = self.nums.pop()
        self.sum-=x
        return x
    def __repr__(self):
        return repr(self.nums)
    def __lt__(self,r):
        if not isinstance(r,record): return NotImplemented
        return self.sum<r.sum
    def __eq__(self,r):
        if not isinstance(r,record): return NotImplemented
        return self.sum==r.sum
    def tolist(self):
        '''Return a copy of the job list.'''
        return self.nums.copy()
    def __hash__(self):
        # hash() keeps this valid for non-int loads; consistent with __eq__
        return hash(self.sum)

def schedule(works,k):
    '''Distribute the jobs in works over k parallel machines so that the
    largest machine load (the finishing time) is as small as possible.

    works: job durations; the caller's list is not modified.
    k:     number of machines, must be >= 1 (ValueError otherwise).

    Returns (best, rst): the minimal finishing time and one plan reaching
    it, as a list of k job lists. Progress and timings are printed.

    A greedy pass on the jobs sorted in descending order supplies the first
    plan and upper bound; an exhaustive backtracking search then looks for
    strictly better plans, pruning every branch that cannot beat the bound.
    '''
    if k<1:
        raise ValueError('schedule needs at least one machine')
    print()
    print('machine Num:',k)
    print('works :',works)
    works = sorted(works,reverse=True)  # key step: big jobs first tightens the pruning
    n = len(works)

    def findInitial():
        '''Greedy start: give each job to the currently least loaded machine.'''
        machines = [record() for _ in range(k)]
        for w in works:
            min(machines).append(w)  # records order by load; first minimum wins ties
        return machines

    def backtrackSearch(i,lsts):
        '''Try job i on every machine with a distinct load, keep improvements.'''
        nonlocal best,rst
        if i==n:
            cost = max(r.sum for r in lsts )
            if best>cost:
                best= cost
                rst = [st.tolist() for st in lsts]
        else:
            # set() collapses machines with equal load: they are interchangeable
            for cur in set(lsts):
                # prune: this branch could at most tie the best known plan
                if best>cur.sum+works[i]:
                    cur.append(works[i])
                    backtrackSearch(i+1,lsts)
                    cur.pop()

    t = time()
    greedy = findInitial()  # key step
    best = max(r.sum for r in greedy)
    # keep the greedy plan: the search below only records strictly better
    # plans, so without this the result would be None whenever greedy is optimal
    rst = [r.tolist() for r in greedy]
    t1 = time()-t
    print('init solution: {} cost time {:.6f}s'.format(best,t1))
    t = time()
    backtrackSearch(0,[record() for i in range(k)])
    t2 = time()-t
    print('final solution: {} cost time {:.6f}s'.format(best,t2))
    print('schedule plan:',rst)
    return best,rst
|
||||||
|
|
||||||
|
if __name__=='__main__':
|
||||||
|
from random import randint
|
||||||
|
schedule([47,20,28,44,21,45,30,39,28,33],3)
|
||||||
|
schedule([98,84,50,23,32,99,22,76,72,61,81,39,76,54,37],5)
|
||||||
|
schedule([39,39,23,45,100,69,21,81,39,55,20,86,34,53,58,99,36,45,46],8)
|
||||||
|
|
||||||
|
'''
|
||||||
|
machine Num: 19
|
||||||
|
works : [39, 39, 23, 45, 100, 69, 21, 81, 39, 55, 20, 86, 34, 53, 58, 99, 36, 45, 46]
|
||||||
|
|
||||||
|
works 经过逆序排序
|
||||||
|
init solution: 135 cost time 0.000196s
|
||||||
|
final solution: 126 cost time 0.022922s
|
||||||
|
schedule plan: [[100, 21], [99, 23], [86, 39], [81, 45], [69, 53], [58, 45, 20], [55, 36, 34], [46, 39, 39]]
|
||||||
|
|
||||||
|
works 没有经过排序
|
||||||
|
init solution: 168 cost time 0.000179s
|
||||||
|
final solution: 126 cost time 10.646307s
|
||||||
|
schedule plan: [[39, 86], [39, 34, 53], [23, 99], [45, 39, 36], [100, 20], [69, 55], [21, 58, 46], [81, 45]]
|
||||||
|
'''
|
58
string/KMP.py
Normal file
58
string/KMP.py
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
#coding: utf-8
|
||||||
|
''' mbinary
|
||||||
|
#########################################################################
|
||||||
|
# File : KMP.py
|
||||||
|
# Author: mbinary
|
||||||
|
# Mail: zhuheqin1@gmail.com
|
||||||
|
# Blog: https://mbinary.coding.me
|
||||||
|
# Github: https://github.com/mbinary
|
||||||
|
# Created Time: 2018-12-11 14:02
|
||||||
|
# Description:
|
||||||
|
#########################################################################
|
||||||
|
'''
|
||||||
|
|
||||||
|
def getPrefixFunc(s):
    '''Return the prefix function of s as a list of len(s) ints.

    Entry i is the length of the longest proper prefix of s[:i+1] that is
    also a suffix of it. An empty s yields an empty list.
    '''
    n = len(s)
    ret = [0]*n
    length = 0  # length of the border currently being extended
    i = 1
    while i<n:
        if s[i]==s[length]:
            length +=1
            ret[i] = length
            i+=1
        elif length:
            # mismatch: fall back to the next shorter border and retry s[i]
            length = ret[length-1]
        else:
            i+=1
    return ret
|
||||||
|
|
||||||
|
def findAll(s,p):
    '''Return the start indices of all (possibly overlapping) occurrences
    of pattern p in s, found with the KMP algorithm.

    s and p are indexable sequences with comparable items (str, list, ...).
    An empty pattern occurs at every position 0..len(s).
    '''
    n,m = len(s),len(p)
    if m==0:
        # p[j] below would raise IndexError for an empty pattern
        return list(range(n+1))
    pre = getPrefixFunc(p)
    i = j = 0
    ret = []
    while i<n:
        if s[i]==p[j]:
            i+=1
            j+=1
            if j==m:
                ret.append(i-j)
                # continue from the longest border so overlaps are found
                j = pre[j-1]
        elif j==0:
            i+=1
        else:
            # reuse the already matched prefix instead of rescanning s
            j = pre[j-1]
    return ret
|
||||||
|
def randStr(n=3):
    '''Return a list of n random ints, each the code point of a letter a..z.'''
    # local import: the file only imports randint under the __main__ guard,
    # so the name is missing when this module is imported
    from random import randint
    lo,hi = ord('a'),ord('z')
    return [randint(lo,hi) for i in range(n)]
|
||||||
|
|
||||||
|
if __name__ =='__main__':
|
||||||
|
from random import randint
|
||||||
|
s = randStr(50)
|
||||||
|
p = randStr(1)
|
||||||
|
print(s)
|
||||||
|
print(p)
|
||||||
|
print(findAll(s,p))
|
110
string/README.md
Normal file
110
string/README.md
Normal file
|
@ -0,0 +1,110 @@
|
||||||
|
# String Matching algorithm
|
||||||
|
|
||||||
|
![](https://upload-images.jianshu.io/upload_images/7130568-e10dc137e9083a0e.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
|
||||||
|
|
||||||
|
## Rabin-Karp
|
||||||
|
We can view a string of k characters (digits) as a length-k decimal number. E.g., the string “31425” corresponds to the decimal number 31,425.
|
||||||
|
- Given a pattern P [1..m], let p denote the corresponding decimal value.
|
||||||
|
- Given a text T [1..n], let $t_s$ denote the decimal value of the length-m substring T [(s+1)..(s+m)] for s=0,1,…,(n-m).
|
||||||
|
- let `d` be the radix of num, thus $d = len(set(s))$
|
||||||
|
- $t_s$ = p iff T [(s+1)..(s+m)] = P [1..m].
|
||||||
|
- p can be computed in O(m) time. p = P[m] + d\*(P[m-1] + d\*(P[m-2]+…)).
|
||||||
|
- t0 can similarly be computed in O(m) time.
|
||||||
|
- The other values $t_1,\ldots,t_{n-m}$ can be computed in O(n-m) total time, since $t_{s+1}$ can be computed from $t_s$ in constant time.
|
||||||
|
Namely,
|
||||||
|
|
||||||
|
$$
|
||||||
|
t_{s+1} = d*(t_s-d^{m-1} * T[s+1])+T[s+m+1]
|
||||||
|
$$
|
||||||
|
However, there is no need to compute $t_{s+1}$ exactly. We can use the modulus operation to reduce the amount of calculation.
|
||||||
|
|
||||||
|
We choose a small prime number q, e.g. q = 13 for radix d = 10.
|
||||||
|
Generally, d\*q should fit within one computer word.
|
||||||
|
|
||||||
|
We first calculate $t_0 \bmod q$.
|
||||||
|
Then, for every $t_i\ (i \ge 1)$,
|
||||||
|
assume
|
||||||
|
$$
|
||||||
|
t_{i-1} = T[i+m-1] + d*T[i+m-2]+\ldots+d^{m-1}*T[i]
|
||||||
|
$$
|
||||||
|
denote $d' = d^{m-1} \bmod q$
|
||||||
|
thus,
|
||||||
|
$$
|
||||||
|
\begin{aligned}
|
||||||
|
t_i &= (t_{i-1} - d^{m-1}*T[i]) * d + T[i+m]\\
|
||||||
|
&\equiv (t_{i-1} - d^{m-1}*T[i]) * d + T[i+m] \pmod{q}\\
|
||||||
|
&\equiv (t_{i-1}- ( d^{m-1} \bmod q) *T[i]) * d + T[i+m] \pmod{q}\\
|
||||||
|
&\equiv (t_{i-1}- d'*T[i]) * d + T[i+m] \pmod{q}
|
||||||
|
\end{aligned}
|
||||||
|
$$
|
||||||
|
|
||||||
|
So we can compare the modular value of each ti with p's.
|
||||||
|
Only if they are the same do we compare the original characters, namely $T[i+1],T[i+2],\ldots,T[i+m]$, with the pattern.
|
||||||
|
Generally, this algorithm's expected running time is O(n+m), and the worst case is O((n-m+1)\*m).
|
||||||
|
|
||||||
|
**Problem: this is assuming p and ts are small numbers. They may be too large to work with easily.**
|
||||||
|
|
||||||
|
## FSM
|
||||||
|
A FSM can be represented as (Q,q0,A,S,C), where
|
||||||
|
- Q is the set of all states
|
||||||
|
- q0 is the start state
|
||||||
|
- $A\subseteq Q$ is a set of accepting states.
|
||||||
|
- S is a finite input alphabet.
|
||||||
|
- C is the set of transition functions: namely $q_j = c(s,q_i)$.
|
||||||
|
|
||||||
|
Given a pattern string S, we can build a FSM for string matching.
|
||||||
|
Assume S has m chars, and there should be m+1 states. One is for the begin state, and the others are for matching state of each position of S.
|
||||||
|
|
||||||
|
Once we have built the FSM, we can run it on any input string.
|
||||||
|
## KMP
|
||||||
|
>Knuth-Morris-Pratt method
|
||||||
|
|
||||||
|
The idea is inspired by FSM. We can avoid computing the transition functions. Instead, we compute a prefix function `Next` on P in O(m) time, and `Next` has only m entries.
|
||||||
|
> The prefix function stores information about how the pattern matches against shifts of itself.
|
||||||
|
|
||||||
|
- String w is a prefix of string x, if x=wy for some string y
|
||||||
|
- String w is a suffix of string x, if x=yw for some string y
|
||||||
|
- The k-character prefix of the pattern P [1..m] denoted by Pk.
|
||||||
|
- Given that pattern prefix P [1..q] matches text characters T [(s+1)..(s+q)], what is the least shift s'> s such that P [1..k] = T [(s'+1)..(s'+k)] where s'+k=s+q?
|
||||||
|
- At the new shift s', no need to compare the first k characters of P with corresponding characters of T.
|
||||||
|
Method: For prefix pi, find the longest proper prefix of pi that is also a suffix of pi.
|
||||||
|
next[q] = max{k|k\<q and pk is a suffix of pq}
|
||||||
|
|
||||||
|
For example: p = ababaca, for p5 = ababa, Next[5] = 3. Namely p3=aba is the longest prefix of p that is also a suffix of p5.
|
||||||
|
|
||||||
|
Time complexity: computing the prefix function `next` takes O(m), and matching takes O(m+n).
|
||||||
|
|
||||||
|
## Boyer-Moore
|
||||||
|
- The longer the pattern is, the faster it works.
|
||||||
|
- Starts from the end of pattern, while KMP starts from the beginning.
|
||||||
|
- Works best for character string, while KMP works best for binary string.
|
||||||
|
- KMP and Boyer-Moore
|
||||||
|
- Preprocessing existing patterns.
|
||||||
|
- Searching patterns in input strings.
|
||||||
|
## Sunday
|
||||||
|
### features
|
||||||
|
- simplification of the Boyer-Moore algorithm;
|
||||||
|
- uses only the bad-character shift;
|
||||||
|
- easy to implement;
|
||||||
|
- preprocessing phase in O(m+sigma) time and O(sigma) space complexity;
|
||||||
|
- searching phase in O(mn) time complexity;
|
||||||
|
- very fast in practice for short patterns and large alphabets.
|
||||||
|
### description
|
||||||
|
The Quick Search algorithm uses only the bad-character shift table (see chapter Boyer-Moore algorithm). After an attempt where the window is positioned on the text factor y[j .. j+m-1], the length of the shift is at least equal to one. So, the character y[j+m] is necessarily involved in the next attempt, and thus can be used for the bad-character shift of the current attempt.
|
||||||
|
|
||||||
|
The bad-character shift of the present algorithm is slightly modified to take into account the last character of x as follows: for c in Sigma, qsBc[c] = min{i : 0 < i ≤ m and x[m-i] = c} if c occurs in x, and m+1 otherwise (thanks to Darko Brljak).
|
||||||
|
|
||||||
|
The preprocessing phase is in O(m+sigma) time and O(sigma) space complexity.
|
||||||
|
|
||||||
|
During the searching phase the comparisons between pattern and text characters during each attempt can be done in any order. The searching phase has a quadratic worst case time complexity but it has a good practical behaviour.
|
||||||
|
|
||||||
|
For instance,
|
||||||
|
![image.png](https://upload-images.jianshu.io/upload_images/7130568-76d130ae24603d51.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
|
||||||
|
|
||||||
|
In this example, t0, ..., t4 = a b c a b is the current text window that is compared with the pattern. Its suffix a b has matched, but the comparison c-a causes a mismatch. The bad-character heuristics of the Boyer-Moore algorithm (a) uses the "bad" text character c to determine the shift distance. The Horspool algorithm (b) uses the rightmost character b of the current text window. The Sunday algorithm (c) uses the character directly right of the text window, namely d in this example. Since d does not occur in the pattern at all, the pattern can be shifted past this position.
|
||||||
|
|
||||||
|
|
||||||
|
# Reference:
|
||||||
|
1. Xuyun, ppt, String matching
|
||||||
|
2. [Sunday-algorithm](http://www.inf.fh-flensburg.de/lang/algorithmen/pattern/sunday.htm)
|
||||||
|
3. GeeksforGeeks, [KMP Algorithm](https://www.geeksforgeeks.org/kmp-algorithm-for-pattern-searching/)
|
60
string/rabin_karp.py
Normal file
60
string/rabin_karp.py
Normal file
|
@ -0,0 +1,60 @@
|
||||||
|
#coding: utf-8
|
||||||
|
''' mbinary
|
||||||
|
#########################################################################
|
||||||
|
# File : rabin_karp.py
|
||||||
|
# Author: mbinary
|
||||||
|
# Mail: zhuheqin1@gmail.com
|
||||||
|
# Blog: https://mbinary.coding.me
|
||||||
|
# Github: https://github.com/mbinary
|
||||||
|
# Created Time: 2018-12-11 00:01
|
||||||
|
# Description: rabin-karp algorithm
|
||||||
|
#########################################################################
|
||||||
|
'''
|
||||||
|
|
||||||
|
def isPrime(x):
    '''Return True if the integer x is prime (False for anything below 2).'''
    if x<2:return False
    i = 2
    # trial division up to sqrt(x), in exact integer arithmetic
    while i*i<=x:
        if x%i==0:return False
        i+=1
    return True

def getPrime(x):
    '''Return the smallest prime that is >= x (2 when x < 2).'''
    i = max(x,2)
    while not isPrime(i):
        i+=1
    return i

def findAll(s,p):
    '''Return the start indices of all (possibly overlapping) occurrences
    of pattern p in s, found with the Rabin-Karp rolling hash.

    s: string  p: pattern. Both must be sequences of the same kind whose
    items are hashable (str/str, list/list, ...), because candidate windows
    are confirmed by comparing slices of s with p.
    An empty pattern occurs at every position 0..len(s); a pattern that is
    longer than s or contains an item missing from s yields [].
    '''
    n,m = len(s),len(p)
    if m==0:return list(range(n+1))
    if m>n:return []
    # map every distinct item of s to a digit 0..d-1; d is the radix
    dic={}
    for c in s:
        if c not in dic:
            dic[c]=len(dic)
    d = len(dic)
    q = getPrime(m)
    # hash of the pattern, reduced mod q at every step to keep numbers small
    sm = 0
    for c in p:
        if c not in dic:return []
        sm = (sm*d+dic[c]) % q
    # hash of the first window s[0:m]
    cur = 0
    for i in range(m):
        cur = (cur*d+dic[s[i]]) % q
    exp = pow(d,m-1,q)  # weight of the window's leading digit, mod q
    ret = []
    # equal hashes are only a hint: confirm with a real comparison
    if cur==sm and p==s[:m]:ret.append(0)
    for i in range(m,n):
        # slide the window: drop s[i-m], shift by one digit, add s[i]
        cur = ((cur-dic[s[i-m]]*exp)*d+dic[s[i]]) % q
        if cur == sm and p==s[i-m+1:i+1]:
            ret.append(i-m+1)
    return ret
|
||||||
|
|
||||||
|
def randStr(n=3):
    '''Return a list of n random ints, each the code point of a letter a..z.'''
    # local import: the file only imports randint under the __main__ guard,
    # so the name is missing when this module is imported
    from random import randint
    lo,hi = ord('a'),ord('z')
    return [randint(lo,hi) for i in range(n)]
|
||||||
|
|
||||||
|
if __name__ =='__main__':
|
||||||
|
from random import randint
|
||||||
|
s = randStr(50)
|
||||||
|
p = randStr(1)
|
||||||
|
print(s)
|
||||||
|
print(p)
|
||||||
|
print(findAll(s,p))
|
BIN
string/src/compare.jpg
Normal file
BIN
string/src/compare.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 12 KiB |
BIN
string/src/general.jpg
Normal file
BIN
string/src/general.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 20 KiB |
|
@ -28,30 +28,50 @@ def find(s,p):
|
||||||
if s[ps] == p[pp]:
|
if s[ps] == p[pp]:
|
||||||
ps,pp = ps+1,pp+1
|
ps,pp = ps+1,pp+1
|
||||||
else:
|
else:
|
||||||
idx = ps-pp+np
|
idx = ps+ np-pp
|
||||||
if idx >=ns:return -1
|
if idx >=ns:return -1
|
||||||
ch = s[idx]
|
ch = s[idx]
|
||||||
if ch in dic:
|
if ch in dic:
|
||||||
ps += dic[ch]+1-pp
|
ps += dic[ch]+1-pp
|
||||||
else:
|
else:
|
||||||
ps += np-pp
|
ps = idx+1
|
||||||
pp = 0
|
pp = 0
|
||||||
if pp==np:return ps-np
|
if pp==np:return ps-np
|
||||||
else:
|
else:
|
||||||
return -1
|
return -1
|
||||||
def test():
|
def findAll(s,p):
    '''Return the start indices of all non-overlapping occurrences of p in
    s, by calling find(s,p) repeatedly on the not yet searched tail of s.

    find is expected to return the index of the first match or -1.
    An empty pattern is reported at every position 0..len(s) instead of
    being searched for, because a zero-length match would never advance.
    '''
    np = len(p)
    if np==0:
        return list(range(len(s)+1))
    i = 0  # offset of the current tail inside the original s
    ret = []
    while s:
        tmp = find(s,p)
        if tmp==-1: break
        ret.append(i+tmp)
        # resume right after this match
        end = tmp+np
        i +=end
        s = s[end:]
    return ret
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def randStr(n=3):
    '''Return a list of n random ints, each the code point of a letter a..z.'''
    # local import: the file only imports randint under the __main__ guard,
    # so the name is missing when this module is imported
    from random import randint
    lo,hi = ord('a'),ord('z')
    return [randint(lo,hi) for i in range(n)]
|
||||||
|
|
||||||
|
def test(n):
|
||||||
|
s = randStr(n)
|
||||||
|
p = randStr(3)
|
||||||
str_s = ''.join((chr(i) for i in s))
|
str_s = ''.join((chr(i) for i in s))
|
||||||
str_p = ''.join((chr(i) for i in p))
|
str_p = ''.join((chr(i) for i in p))
|
||||||
n1 = find(s,p)
|
n1 = find(s,p)
|
||||||
n2 = str_s.find(str_p)
|
n2 = str_s.find(str_p) # 利用已有的 str find 算法检验
|
||||||
if n1!=n2:
|
if n1!=n2:
|
||||||
print(n1,n2,str_p,str_s)
|
print(n1,n2,str_p,str_s)
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
if __name__ =='__main__':
|
if __name__ =='__main__':
|
||||||
from random import randint
|
from random import randint
|
||||||
n = 10000
|
n = 1000
|
||||||
suc = sum(test() for i in range(n))
|
suc = sum(test(n) for i in range(n))
|
||||||
print(f'test {n} times, success {suc} times')
|
print('test {n} times, success {suc} times'.format(n=n,suc=suc))
|
||||||
|
|
Loading…
Reference in New Issue
Block a user