from collections import deque def buildAutomata(words,MAXC=26): MAXS=0 for w in words: MAXS+=len(w) states, k, out, f, g = 1, len(words), [0]*MAXS, [-1]*MAXS, [[-1]*MAXC for _ in xrange(MAXS)] for i in xrange(k): currentState=0 for ch in words[i]: x=ord(ch)-ord('a') if(g[currentState][x]==-1): g[currentState][x]=states states+=1 currentState=g[currentState][x] out[currentState]|=(1<<i) for x in xrange(MAXC): if(g[0][x]==-1): g[0][x]=0 Q=deque() for x in xrange(MAXC): if(g[0][x]!=0): f[g[0][x]]=0 Q.append(g[0][x]) while(len(Q)>0): state=Q.popleft() for x in xrange(MAXC): if(g[state][x]!=-1): failure=f[state] while(g[failure][x]==-1): failure=f[failure] failure=g[failure][x] f[g[state][x]]=failure out[g[state][x]]|=out[failure] Q.append(g[state][x]) return out,f,g #--- def nextState(AUTOMATA, currentState,nextInput): out,f,g=AUTOMATA[0],AUTOMATA[1],AUTOMATA[2] answer, x = currentState, ord(nextInput)-ord('a') while(g[answer][x]==-1): answer=f[answer] return g[answer][x] #--- def searchWords(AUTOMATA, text, words): out,f,g=AUTOMATA[0],AUTOMATA[1],AUTOMATA[2] currentState=0 for i in xrange(len(text)): currentState=nextState(AUTOMATA, currentState,text[i]) if(out[currentState]==0): continue for j in xrange(len(words)): if(out[currentState] & (1<<j)): print 'Word '+words[j]+' appears from '+str(i-len(words[j])+1)+' to '+str(i) #--- words=["he","she","hers","his"] text="ahehishers" AUTOMATA=buildAutomata(words) searchWords(AUTOMATA, text,words)
Aho-Corasick (Python)
Subscribe to:
Posts (Atom)