String matching algorithm, permutation algorithm

This commit is contained in:
mbinary 2018-12-11 15:28:05 +08:00
parent 3b8fa1782b
commit 166cd2737b
19 changed files with 479 additions and 36 deletions

View File

@ -84,7 +84,7 @@ def genNum(n =10,upper=10):
return nums.values()
def buildTree(n=10,nums=None,visitor=None):
if nums is None or nums ==[]: nums = genNum(n)
#if nums is None or nums ==[]: nums = genNum(n)
tree = intervalTree()
print(f'build a red-black tree using {nums}')
for i in nums:
@ -100,6 +100,7 @@ def testInsert(nums=None):
print('-'*5+ 'in-order visit' + '-'*5)
for i,j in enumerate(tree.sort()):
print(f'{i+1}: {j}')
return tree
def testSuc(nums=None):
tree,nums = buildTree(nums=nums)
@ -113,10 +114,16 @@ def testDelete(nums=None):
print(f'deleting {i}')
tree.delete(i[0])
print(tree)
return tree
if __name__=='__main__':
lst = [(0,3),(5,8),(6,10),(26,26),(25,30),(8,9),(19,20),(15,23),(16,21),(17,19)]
lst = None
#lst = None
#testSuc(lst)
#testInsert(lst)
testDelete(lst)
tree = testInsert(lst)
#tree,_= buildTree(lst)
while 1:
a =int( input('low:'))
b =int( input('high:'))
res = tree.search(a,b)
print(res)

View File

@ -286,6 +286,7 @@ def buildTree(n=10,nums=None,visitor=None):
print(f'build a red-black tree using {nums}')
for i in nums:
rbtree.insert(node(i))
print(rbtree)
if visitor:
visitor(rbtree,i)
return rbtree,nums

View File

@ -107,18 +107,18 @@ def test(f=minDistance_n2):
print('result: {:.2f} {} {}\n'.format(minD, p,q))
def genData(n,unique=True):
upper = 1000000
if unique:
points = set()
for i in range(n):
points.add(point(randint(1,1000),randint(1,1000)))
points.add(point(randint(1,upper),randint(1,upper)))
return list(points)
else:return [point(randint(1,1000),randint(1,1000)) for i in range(n)]
else:return [point(randint(1,upper),randint(1,upper)) for i in range(n)]
if __name__ =='__main__':
n = 10000
n = 1000
points = genData(n, unique=True)
print('min distance of {} points'.format(n))
#print(sorted(points))
test(minDistance_n2)
test(minDistance_nlogn)

View File

@ -1,8 +0,0 @@
import Vec2d (Vec2d,getVal,setVal)
lcs a b =
let m = lenghth a
n = length b
rst = []
in 1 --to do

View File

@ -29,17 +29,22 @@ def lcs2(a,b):
m,n= len(a),len(b)
board = [[] for i in range(n+1)]
for i in range(m):
last = []
upperLevel = board[0].copy()
for j in range(n):
tmp = board[j+1].copy()
if a[i]==b[j]:
board[j+1] =board[j]+[a[i]]
elif len(board[j+1]) < len(last):
board[j+1] = last
last = board[j+1]
board[j+1] = upperLevel+[a[i]]
elif len(board[j+1]) < len(board[j]):
board[j+1] = board[j].copy() # copy is needed
upperLevel = tmp
return board[n]
if __name__ =='__main__':
a="dsaffqewqfqewregqwefqwe"
b="adsfsfs3qt5yhyh24efwq"
print(lcs(a,b))
print(lcs2(a,b))
a = 'ABCBDAB'
b = 'BDCABA'
print('s1:',a)
print('s2:',b)
while 1:
print('lcs:',lcs2(a,b))
a = input('s1: ')
b = input('s2: ')

View File

@ -0,0 +1,12 @@
def permute(n):
    """Print every permutation of ``0 .. n-1``, one list per line.

    The permutations are produced in place by swapping: position ``pos``
    receives, in turn, each element that is still free, the remaining
    positions are filled recursively, and the swap is undone afterwards.
    """
    def _place(seq, pos):
        # seq[:pos] is already fixed; choose the element for index ``pos``.
        if pos == n:
            print(seq)
            return
        for k in range(pos, n):
            seq[pos], seq[k] = seq[k], seq[pos]
            _place(seq, pos + 1)
            seq[pos], seq[k] = seq[k], seq[pos]  # restore before next choice

    _place(list(range(n)), 0)


if __name__ == '__main__':
    permute(5)

View File

@ -11,7 +11,7 @@ void calFac(int n)
}
}
void getArrangement(int *arr,int n,int sum)
void permute(int *arr,int n,int sum)
{
/*sum表示全排列由小到大排序后的名次,从0 开始计数, 由名次求出 n位的排列存储到 arr 中*/
int i,j,ct=0,k, ct2;
@ -36,3 +36,21 @@ void getArrangement(int *arr,int n,int sum)
}
}
/* Print the n ints starting at p on one line, each followed by ", ",
 * then terminate the line. */
void printArr(int *p,int n)
{
    int idx = 0;
    while (idx < n) {
        printf("%d, ", p[idx]);
        ++idx;
    }
    printf("\n");
}
/* Demo driver: print all n! permutations of 0..n-1 in rank order. */
int main()
{
    int n = 5,arr[n];
    calFac(n);                       /* fills fac[] so that fac[n] can be read below */
    for(int i=0;i<n;++i)arr[i]=i;    /* was i<5: keep the bound tied to n */
    for(int i=0;i<fac[n];++i){
        /* Per the comment in permute(), it stores the permutation of rank i
         * (0-based) into arr, so it must run BEFORE printing.  The previous
         * order printed the initial array twice and never showed the
         * permutation of rank fac[n]-1. */
        permute(arr,n,i);
        printArr(arr,n);
    }
    return 0;
}

View File

@ -0,0 +1,12 @@
def permute(lst, n):
    """Print every arrangement of the first ``n`` items of ``lst``.

    Each of the first ``n`` items is swapped into index ``n-1`` in turn, the
    first ``n-1`` items are permuted recursively, and the swap is undone, so
    ``lst`` is back in its initial order when the call returns.  ``n!``
    lines are printed; items at index ``n`` and beyond never move.

    ``n == 0`` prints ``lst`` once (there is exactly one arrangement of
    nothing).  A negative ``n`` prints nothing.
    """
    if 0 <= n <= 1:
        print(lst)
        return
    for i in range(n):
        lst[i], lst[n - 1] = lst[n - 1], lst[i]
        permute(lst, n - 1)
        lst[i], lst[n - 1] = lst[n - 1], lst[i]  # undo the swap


if __name__ == '__main__':
    n = 3
    permute(list(range(n)), n)

3
math/primesLEn.hs Normal file
View File

@ -0,0 +1,3 @@
-- | All primes @<= n@, largest first, e.g. @genPrimes 10 == [7,5,3,2]@.
--
-- @n@ is prime exactly when none of the smaller primes divides it.
-- Arguments below 2 yield the empty list; previously they recursed forever
-- because the only base case was @genPrimes 2@.
genPrimes :: Integral a => a -> [a]
genPrimes n
  | n < 2     = []
  | otherwise =
      let smaller = genPrimes (n - 1)
      in if all (\p -> n `mod` p /= 0) smaller then n : smaller else smaller

34
search/BFS_knight.hs Normal file
View File

@ -0,0 +1,34 @@
{- mbinary
#########################################################################
# File : BFS_knight.hs
# Author: mbinary
# Mail: zhuheqin1@gmail.com
# Blog: https://mbinary.coding.me
# Github: https://github.com/mbinary
# Created Time: 2018-11-11 19:40
# Description:
#########################################################################
-}
{-
Given two different positions on a chess board, find the least number of moves it would take a knight to get from one to the other. The positions will be passed as two arguments in algebraic notation. For example, knight("a3", "b5") should return 1.
The knight is not allowed to move off the board. The board is 8x8.
-}
module ShortestKnightPath.Kata (knight) where
import Data.Char
import Data.List
-- | Least number of knight moves between two squares given in algebraic
-- notation (e.g. @knight "a3" "b5" == 1@).
--
-- Level-by-level breadth-first search: the k-th element of 'frontiers' is
-- the set of squares reachable in exactly k moves, so the number of
-- frontiers that do not yet contain the goal is the answer.
knight :: String -> String -> Int
knight s1 s2 = length (takeWhile goalMissing frontiers)
  where
    start = axis s1
    goal = axis s2
    frontiers = iterate gen [start]
    goalMissing = notElem goal
-- | One BFS step: every square on the 8x8 board (coordinates 1..8) that a
-- knight can reach in a single move from some square of @li@, without
-- duplicates.
gen li = nub (flatten [filter onBoard (change sq) | sq <- li])
  where
    onBoard (a, b) = a > 0 && b > 0 && a < 9 && b < 9
-- | The eight squares a knight standing on @(a,b)@ jumps to; no bounds
-- check is done here.
change (a, b) = [(a + da, b + db) | (da, db) <- jumps]
  where
    jumps = [(-1, -2), (-1, 2), (1, -2), (1, 2), (2, -1), (2, 1), (-2, 1), (-2, -1)]
-- | Convert an algebraic square such as @"a3"@ to 1-based coordinates:
-- the letter becomes its distance from @'a'@ plus one (96 is @ord 'a' - 1@)
-- and the second character is read as a digit.
axis s = (column, row)
  where
    column = ord (s !! 0) - 96
    row = digitToInt (s !! 1) :: Int
-- | Concatenate a list of lists, preserving order.
flatten :: [[a]] -> [a]
flatten = foldr (++) []

7
search/binary_search.hs Normal file
View File

@ -0,0 +1,7 @@
-- | Binary search for @i@ in the list @li@ (the comparisons assume @li@ is
-- in ascending order).
--
-- Returns the index of the probe that equals @i@ if one is hit.  Once the
-- window @[lo, hi]@ has shrunk to at most one element, @lo@ is returned
-- WITHOUT comparing that element to @i@, so the result is a position, not
-- a proof of membership.
search i li = go 0 (length li - 1)
  where
    go lo hi
      | lo >= hi   = lo
      | probe == i = mid
      | probe > i  = go lo (mid - 1)
      | otherwise  = go (mid + 1) hi
      where
        mid = (lo + hi) `div` 2
        probe = li !! mid

104
search/schedule.py Normal file
View File

@ -0,0 +1,104 @@
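'''
Backtracking search over the whole solution space, with pruning.
Given n tasks to be completed by k machines that work in parallel, where task i takes time t_i, design an algorithm that finds the best schedule, i.e. the one that finishes all n tasks as early as possible.
'''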
from time import time
from functools import total_ordering
@total_ordering
class record:
    """The jobs assigned to one machine, with their total cached in ``sum``.

    Records compare, order and hash by ``sum`` alone, so two machines with
    the same load are interchangeable inside a ``set``.  Because the hash
    follows a mutable value, a record must not be modified while it is
    stored in a set or used as a dict key.
    """

    def __init__(self, nums=None):
        # Copy the input so append()/pop() never mutate the caller's list.
        self.nums = [] if nums is None else list(nums)
        self.sum = sum(self.nums)

    def append(self, x):
        """Add job ``x`` and update the cached load."""
        self.nums.append(x)
        self.sum += x

    def pop(self):
        """Remove and return the most recently added job."""
        x = self.nums.pop()
        self.sum -= x
        return x

    def __repr__(self):
        return repr(self.nums)

    def __lt__(self, r):
        if not isinstance(r, record):
            return NotImplemented  # let Python try the reflected operation
        return self.sum < r.sum

    def __eq__(self, r):
        if not isinstance(r, record):
            return NotImplemented
        return self.sum == r.sum

    def tolist(self):
        """Return a copy of the job list."""
        return self.nums.copy()

    def __hash__(self):
        # hash() keeps this valid when loads are floats; __hash__ must
        # return an int.
        return hash(self.sum)
def schedule(works, k):
    """Assign jobs to ``k`` identical machines so the last one finishes earliest.

    Args:
        works: processing time of each job.  The sequence is not modified.
        k: number of machines, at least 1.

    Returns:
        ``(best, rst)`` where ``best`` is the minimal makespan and ``rst`` is
        a list of ``k`` lists holding the job times given to each machine.

    Raises:
        ValueError: if ``k`` is smaller than 1.

    Method: jobs are handled longest first.  A greedy pass (each job goes to
    the currently least-loaded machine) yields an initial plan and upper
    bound; a backtracking search then looks for strictly better plans,
    pruning every branch that cannot beat the bound and trying only one
    machine per distinct load (machines with equal load are symmetric).

    Progress and timing are printed to stdout.
    """
    if k < 1:
        raise ValueError('schedule() needs at least one machine, got k={}'.format(k))

    n = len(works)
    print()
    print('machine Num:', k)  # used to print the job count under this label
    print('works :', works)
    jobs = sorted(works, reverse=True)  # key step; sorted() leaves the argument untouched

    # Key step: a greedy plan gives a bound AND a plan to return if the
    # search cannot improve on it (previously rst stayed None in that case).
    t = time()
    loads = [0] * k
    plan = [[] for _ in range(k)]
    for w in jobs:
        idx = loads.index(min(loads))  # first least-loaded machine
        loads[idx] += w
        plan[idx].append(w)
    best = max(loads)
    rst = [jobs_of_machine.copy() for jobs_of_machine in plan]
    t1 = time() - t
    print('init solution: {} cost time {:.6f}s'.format(best, t1))

    loads = [0] * k
    plan = [[] for _ in range(k)]

    def backtrackSearch(i):
        nonlocal best, rst
        if i == n:
            cost = max(loads)
            if best > cost:
                best = cost
                rst = [jobs_of_machine.copy() for jobs_of_machine in plan]
            return
        w = jobs[i]
        tried = set()  # loads already explored at this depth
        for j in range(k):
            load = loads[j]
            if load in tried:
                continue
            tried.add(load)
            if best > load + w:  # only branches that can still beat the bound
                loads[j] += w
                plan[j].append(w)
                backtrackSearch(i + 1)
                plan[j].pop()
                loads[j] -= w

    t = time()
    backtrackSearch(0)
    t2 = time() - t
    print('final solution: {} cost time {:.6f}s'.format(best, t2))
    print('schedule plan:', rst)
    return best, rst
if __name__ == '__main__':
    from random import randint
    # (job times, number of machines) for each demo run, in execution order
    demo_cases = (
        ([47, 20, 28, 44, 21, 45, 30, 39, 28, 33], 3),
        ([98, 84, 50, 23, 32, 99, 22, 76, 72, 61, 81, 39, 76, 54, 37], 5),
        ([39, 39, 23, 45, 100, 69, 21, 81, 39, 55, 20, 86, 34, 53, 58, 99, 36, 45, 46], 8),
    )
    for demo_works, demo_k in demo_cases:
        schedule(demo_works, demo_k)
'''
machine Num: 19
works : [39, 39, 23, 45, 100, 69, 21, 81, 39, 55, 20, 86, 34, 53, 58, 99, 36, 45, 46]
works sorted in descending order
init solution: 135 cost time 0.000196s
final solution: 126 cost time 0.022922s
schedule plan: [[100, 21], [99, 23], [86, 39], [81, 45], [69, 53], [58, 45, 20], [55, 36, 34], [46, 39, 39]]
works left unsorted
init solution: 168 cost time 0.000179s
final solution: 126 cost time 10.646307s
schedule plan: [[39, 86], [39, 34, 53], [23, 99], [45, 39, 36], [100, 20], [69, 55], [21, 58, 46], [81, 45]]
'''

58
string/KMP.py Normal file
View File

@ -0,0 +1,58 @@
#coding: utf-8
''' mbinary
#########################################################################
# File : KMP.py
# Author: mbinary
# Mail: zhuheqin1@gmail.com
# Blog: https://mbinary.coding.me
# Github: https://github.com/mbinary
# Created Time: 2018-12-11 14:02
# Description:
#########################################################################
'''
def getPrefixFunc(s):
    """Return the KMP prefix function of ``s`` as a list.

    Entry ``i`` is the length of the longest proper prefix of ``s[:i+1]``
    that is also a suffix of it.  The result has one entry per element of
    ``s``; an empty ``s`` gives ``[]`` (it used to give ``[0]``).
    """
    n = len(s)
    ret = [0] * n
    length = 0  # length of the border of s[:i]
    for i in range(1, n):
        # Fall back to ever shorter borders until one can be extended.
        while length and s[i] != s[length]:
            length = ret[length - 1]
        if s[i] == s[length]:
            length += 1
        ret[i] = length
    return ret
def findAll(s, p):
    """Return the start index of every occurrence of ``p`` in ``s`` (KMP).

    Overlapping occurrences are all reported, in increasing order.  An empty
    pattern matches at every position ``0 .. len(s)``; it used to raise
    IndexError on a non-empty ``s``.
    """
    n, m = len(s), len(p)
    if m == 0:
        return list(range(n + 1))
    pre = getPrefixFunc(p)
    ret = []
    j = 0  # number of pattern elements currently matched
    for i in range(n):
        # On a mismatch reuse the longest border instead of restarting.
        while j and s[i] != p[j]:
            j = pre[j - 1]
        if s[i] == p[j]:
            j += 1
            if j == m:
                ret.append(i - m + 1)
                j = pre[j - 1]  # keep going, allowing overlaps
    return ret
def randStr(n=3):
    """Return ``n`` random code points of lowercase ASCII letters.

    The result is a list of ints in ``ord('a') .. ord('z')``, not a ``str``.
    """
    # Imported here so the function also works when this module is imported;
    # the module itself only imports randint inside its __main__ guard.
    from random import randint
    return [randint(ord('a'), ord('z')) for _ in range(n)]
if __name__ == '__main__':
    from random import randint
    # Demo: search a random one-element pattern in a random 50-element text.
    s, p = randStr(50), randStr(1)
    print(s, p, sep='\n')
    print(findAll(s, p))

110
string/README.md Normal file
View File

@ -0,0 +1,110 @@
# String Matching algorithm
![](https://upload-images.jianshu.io/upload_images/7130568-e10dc137e9083a0e.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
## Rabin-Karp
We can view a string of k characters (digits) as a length-k decimal number. E.g., the string “31425” corresponds to the decimal number 31,425.
- Given a pattern P [1..m], let p denote the corresponding decimal value.
- Given a text T [1..n], let $t_s$ denote the decimal value of the length-m substring T [(s+1)..(s+m)] for s=0,1,…,(n-m).
- Let `d` be the radix (the size of the alphabet), so $d = len(set(s))$.
- $t_s$ = p iff T [(s+1)..(s+m)] = P [1..m].
- p can be computed in O(m) time. p = P[m] + d\*(P[m-1] + d\*(P[m-2]+…)).
- t0 can similarly be computed in O(m) time.
- The other values $t_1,\ldots,t_{n-m}$ can be computed in O(n-m) time, since $t_{s+1}$ can be computed from $t_s$ in constant time.
Namely,
$$
t_{s+1} = d*(t_s-d^{m-1} * T[s+1])+T[s+m+1]
$$
However, there is no need to calculate $t_{s+1}$ exactly. We can use the modulus operation to reduce the amount of calculation.
We choose a small prime number q, e.g. 13 for radix (denoted d) 10.
Generally, d\*q should fit within one computer word.
We first calculate t0 mod q.
Then, for every $t_i\ (i \ge 1)$,
assume
$$
t_{i-1} = T[i+m-1] + d*T[i+m-2]+\ldots+d^{m-1}*T[i]
$$
denote $ d' = d^{m-1}\ mod\ q$
thus,
$$
\begin{aligned}
t_i &= (t_{i-1} - d^{m-1}*T[i]) * d + T[i+m]\\
&\equiv (t_{i-1} - d^{m-1}*T[i]) * d + T[i+m] (mod\ q)\\
&\equiv (t_{i-1}- ( d^{m-1} mod \ q) *T[i]) * d + T[i+m] (mod\ q)\\
&\equiv (t_{i-1}- d'*T[i]) * d + T[i+m] (mod\ q)
\end{aligned}
$$
So we can compare the modular value of each $t_i$ with that of p.
Only if they are the same do we compare the original characters, namely $T[i+1],T[i+2],\ldots,T[i+m]$, with the pattern.
Generally, this algorithm's expected running time is O(n+m), and the worst case is O((n-m+1)\*m).
**Problem: this is assuming p and ts are small numbers. They may be too large to work with easily.**
## FSM
A FSM can be represented as (Q,q0,A,S,C), where
- Q is the set of all states
- q0 is the start state
- $A\subseteq Q$ is the set of accepting states.
- S is a finite input alphabet.
- C is the set of transition functions: namely $q_j = c(s,q_i)$.
Given a pattern string S, we can build a FSM for string matching.
Assume S has m chars, and there should be m+1 states. One is for the begin state, and the others are for matching state of each position of S.
Once we have built the FSM, we can run it on any input string.
## KMP
>Knuth-Morris-Pratt method
The idea is inspired by FSM. We can avoid computing the transition functions. Instead, we compute a prefix function `Next` on P in O(m) time, and `Next` has only m entries.
> The prefix function stores information about how the pattern matches against shifts of itself.
- String w is a prefix of string x, if x=wy for some string y
- String w is a suffix of string x, if x=yw for some string y
- The k-character prefix of the pattern P [1..m] denoted by Pk.
- Given that pattern prefix P [1..q] matches text characters T [(s+1)..(s+q)], what is the least shift s'> s such that P [1..k] = T [(s'+1)..(s'+k)] where s'+k=s+q?
- At the new shift s', no need to compare the first k characters of P with corresponding characters of T.
Method: For prefix pi, find the longest proper prefix of pi that is also a suffix of pi.
next[q] = max{k|k\<q and pk is a suffix of pq}
For example: p = ababaca, for p5 = ababa, Next[5] = 3. Namely p3=aba is the longest prefix of p that is also a suffix of p5.
Time complexity: computing the prefix function `next` takes O(m), and matching takes O(m+n).
## Boyer-Moore
- The longer the pattern is, the faster it works.
- Starts from the end of pattern, while KMP starts from the beginning.
- Works best for character string, while KMP works best for binary string.
- KMP and Boyer-Moore
- Preprocessing existing patterns.
- Searching patterns in input strings.
## Sunday
### features
- simplification of the Boyer-Moore algorithm;
- uses only the bad-character shift;
- easy to implement;
- preprocessing phase in O(m+sigma) time and O(sigma) space complexity;
- searching phase in O(mn) time complexity;
- very fast in practice for short patterns and large alphabets.
### description
The Quick Search algorithm uses only the bad-character shift table (see chapter Boyer-Moore algorithm). After an attempt where the window is positioned on the text factor y[j .. j+m-1], the length of the shift is at least equal to one. So, the character y[j+m] is necessarily involved in the next attempt, and thus can be used for the bad-character shift of the current attempt.
The bad-character shift of the present algorithm is slightly modified to take into account the last character of x as follows: for c in Sigma, qsBc[c]=min{i : 0 < i leq m and x[m-i]=c} if c occurs in x, m+1 otherwise (thanks to Darko Brljak).
The preprocessing phase is in O(m+sigma) time and O(sigma) space complexity.
During the searching phase the comparisons between pattern and text characters during each attempt can be done in any order. The searching phase has a quadratic worst case time complexity but it has a good practical behaviour.
For instance,
![image.png](https://upload-images.jianshu.io/upload_images/7130568-76d130ae24603d51.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
In this example, t0, ..., t4 = a b c a b is the current text window that is compared with the pattern. Its suffix a b has matched, but the comparison c-a causes a mismatch. The bad-character heuristics of the Boyer-Moore algorithm (a) uses the "bad" text character c to determine the shift distance. The Horspool algorithm (b) uses the rightmost character b of the current text window. The Sunday algorithm (c) uses the character directly right of the text window, namely d in this example. Since d does not occur in the pattern at all, the pattern can be shifted past this position.
# Reference:
1. Xuyun, ppt, String matching
2. [Sunday-algorithm](http://www.inf.fh-flensburg.de/lang/algorithmen/pattern/sunday.htm)
3. GeeksforGeeks, [KMP Algorithm](https://www.geeksforgeeks.org/kmp-algorithm-for-pattern-searching/)

60
string/rabin_karp.py Normal file
View File

@ -0,0 +1,60 @@
#coding: utf-8
''' mbinary
#########################################################################
# File : rabin_karp.py
# Author: mbinary
# Mail: zhuheqin1@gmail.com
# Blog: https://mbinary.coding.me
# Github: https://github.com/mbinary
# Created Time: 2018-12-11 00:01
# Description: rabin-karp algorithm
#########################################################################
'''
def isPrime(x):
    """Return True if the integer ``x`` is prime, by trial division.

    Values below 2 are not prime (0 and 1 used to be reported as prime).
    """
    if x < 2:
        return False
    i = 2
    while i * i <= x:  # integer test, no float square root involved
        if x % i == 0:
            return False
        i += 1
    return True


def getPrime(x):
    """Return the smallest prime that is >= ``x`` (and never below 2).

    For ``x >= 2`` this is the value the previous scan of ``range(x, 2*x)``
    returned.  For ``x < 2`` it returns 2, where the old code returned None
    (x <= 0) or the non-prime 1 (x == 1).
    """
    candidate = max(x, 2)
    while not isPrime(candidate):
        candidate += 1
    return candidate
def findAll(s, p, q=None):
    """Return the start index of every occurrence of ``p`` in ``s`` (Rabin-Karp).

    Args:
        s: text, any sequence of hashable items (``str``, ``list``, ...).
        p: pattern, a sequence of the same kind of items.
        q: modulus of the rolling hash, a positive integer.  Defaults to
            ``getPrime(len(p))``, the value that was hard-wired before.

    Returns:
        Increasing list of match positions, overlaps included.  It is empty
        when there is no match: a pattern longer than the text used to raise
        IndexError, and a pattern containing an item absent from the text
        used to yield ``[-1]``.  An empty pattern matches at every position
        ``0 .. len(s)`` (it used to raise TypeError).

    Raises:
        ValueError: if an explicit ``q`` is smaller than 1.
    """
    n, m = len(s), len(p)
    if m == 0:
        return list(range(n + 1))
    if m > n:
        return []

    # Number the distinct items of the text 0..d-1; d is the radix.
    dic = {}
    for c in s:
        if c not in dic:
            dic[c] = len(dic)
    d = len(dic)
    if any(c not in dic for c in p):
        return []  # the pattern uses an item that never occurs in the text

    if q is None:
        q = getPrime(m)
    elif q < 1:
        raise ValueError('q must be a positive integer')

    # Hashes of the pattern and of the first window, reduced as we go so
    # the intermediate numbers stay small.
    sm = cur = 0
    for k in range(m):
        sm = (sm * d + dic[p[k]]) % q
        cur = (cur * d + dic[s[k]]) % q
    exp = pow(d, m - 1, q)  # weight of the window's leading digit

    def matches(start):
        # Equal hashes are only a hint; confirm item by item.
        return all(s[start + k] == p[k] for k in range(m))

    ret = []
    if cur == sm and matches(0):
        ret.append(0)
    for i in range(m, n):
        # Drop s[i-m] from the front of the window, append s[i] at the back.
        cur = ((cur - dic[s[i - m]] * exp) * d + dic[s[i]]) % q
        if cur == sm and matches(i - m + 1):
            ret.append(i - m + 1)
    return ret
def randStr(n=3):
    """Return ``n`` random code points of lowercase ASCII letters.

    The result is a list of ints in ``ord('a') .. ord('z')``, not a ``str``.
    """
    # Imported here so the function also works when this module is imported;
    # the module itself only imports randint inside its __main__ guard.
    from random import randint
    return [randint(ord('a'), ord('z')) for _ in range(n)]
if __name__ == '__main__':
    from random import randint
    # Demo: search a random one-element pattern in a random 50-element text.
    s, p = randStr(50), randStr(1)
    print(s, p, sep='\n')
    print(findAll(s, p))

BIN
string/src/compare.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

BIN
string/src/general.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

View File

@ -28,30 +28,50 @@ def find(s,p):
if s[ps] == p[pp]:
ps,pp = ps+1,pp+1
else:
idx = ps-pp+np
idx = ps+ np-pp
if idx >=ns:return -1
ch = s[idx]
if ch in dic:
ps += dic[ch]+1-pp
else:
ps += np-pp
ps = idx+1
pp = 0
if pp==np:return ps-np
else:
else:
return -1
def test():
s = [randint(78,88) for i in range(30)]
p = [randint(78,88) for i in range(3)]
def findAll(s, p):
    """Return the start indices of the non-overlapping occurrences of ``p`` in ``s``.

    ``find`` is called repeatedly on the remainder of the text; after each
    hit the search resumes right behind the matched window, so overlapping
    occurrences are not reported.  An empty pattern is answered directly
    with every position ``0 .. len(s)``: a zero-length match would never
    shorten the remaining text, so the loop could not be relied on to end.
    """
    m = len(p)
    if m == 0:
        return list(range(len(s) + 1))
    ret = []
    offset = 0  # index in the original text of the current s[0]
    while s:
        tmp = find(s, p)
        if tmp == -1:
            break
        ret.append(offset + tmp)
        end = tmp + m
        offset += end
        s = s[end:]
    return ret
def randStr(n=3):
    """Return ``n`` random code points of lowercase ASCII letters.

    The result is a list of ints in ``ord('a') .. ord('z')``, not a ``str``.
    """
    # Imported here so the function also works when this module is imported;
    # the module itself only imports randint inside its __main__ guard.
    from random import randint
    return [randint(ord('a'), ord('z')) for _ in range(n)]
def test(n):
s = randStr(n)
p = randStr(3)
str_s = ''.join((chr(i) for i in s))
str_p = ''.join((chr(i) for i in p))
n1 = find(s,p)
n2 = str_s.find(str_p)
n2 = str_s.find(str_p) # 利用已有的 str find 算法检验
if n1!=n2:
print(n1,n2,str_p,str_s)
return False
return True
if __name__ =='__main__':
from random import randint
n = 10000
suc = sum(test() for i in range(n))
print(f'test {n} times, success {suc} times')
n = 1000
suc = sum(test(n) for i in range(n))
print('test {n} times, success {suc} times'.format(n=n,suc=suc))