La prima cosa importante è quella di preprocessare la nostra sequenza query in modo da avere un dizionario che abbia per chiave ogni tripletta di residui contigui appartenenti alla nostra query e per valore l'elenco delle
def preprocessQuery(query,mat,residues): ''' preprocessQuery(query,mat,residues) ''' simpos={} simwords={} for i in range(len(query)-2): word=query[i:i+3] simwords[word]=simwords.get(word,findSumWords(word,mat,residues)) for k in simwords[word]: simpos[k]=simpos.get(k,[]) simpos[k].append(i) return simpos #------------------------#
class PSEUDOBLAST: ''' emulate ungapped blast ''' def __init__(self,filedbase,filequery,fmatrix='blosum62.mat',wordth=11,numalign=25): ''' __init__(self,filedbase,filequery,fmatrix='blosum62.mat',wordth=11,numalign=25) ''' import Seq self.db=Seq.readFasta(filedbase) self.queries=Seq.readFasta(filequery) self.alphabet,self.mat=getmat(fmatrix) self.wth=wordth self.numalign=numalign def findSumWords(self,word): ''' findSumWords(word) ''' score=0 for i in range(len(word)): score+=self.mat[word[i],word[i]] if score < self.wth : return [word] similar=[] for r1 in self.alphabet: for r2 in self.alphabet: for r3 in self.alphabet: if(self.mat[word[0],r1]+self.mat[word[1],r2]+self.mat[word[2],r3] >= self.wth): similar.append(r1+r2+r3) return similar #------------------------# def preprocessQuery(self,query): ''' preprocessQuery(query) ''' simpos={} simwords={} for i in range(len(query)-2): word=query[i:i+3] simwords[word]=simwords.get(word,self.findSumWords(word)) for k in simwords[word]: simpos[k]=simpos.get(k,[]) simpos[k].append(i) return simpos #------------------------# def extendHit(self,query,qpos,target,tpos): ''' extendHit(query,qpos,target,tpos) ''' score=0 for i in range(3): score+=self.mat[target[tpos+i],query[qpos+i]] maxscore=score left,right=0,2 # extend on the right ilim=min(len(target)-tpos,len(query)-qpos) i=3 while(i<ilim and score > 0): score+=self.mat[target[tpos+i],query[qpos+i]] if score > maxscore: maxscore,right=score,i i+=1 # extend on the left score=maxscore ilim=min(tpos,qpos) i=1 while(i<ilim and score > 0): score+=self.mat[target[tpos-i],query[qpos-i]] if score > maxscore: maxscore,left=score,i i+=1 return maxscore,(qpos-left,qpos+right+1),(tpos-left,tpos+right+1) #----------------------------# def isin(self,pos1,pos2,pairlist): ''' isin(self,pos1,pos2,pairlist) returns 1 if pos1 and pos2 are inside a pair of a list 0 otherwise ''' i=0 inlist=0 while i<len(pairlist) and not inlist: (b1,e1),(b2,e2)=pairlist[i] if (pos1 >= b1 and pos1 <=e1) and (pos2 >= b2 and pos2 <=e2): inlist=1 i+=1 return inlist #----------------------------# def scanTarget(self,target,tid,query,qid,qindex): ''' scanTarget(target,tid,query,qid,qindex) ''' maxnum=self.numalign alignments=[] qpositions=[] for i in range(len(target)-2): word=target[i:i+3] tpos=i for qpos in qindex.get(word,[]): if not self.isin(qpos,i,qpositions): align=self.extendHit(query,qpos,target,tpos) qpositions.append(align[1:]) alignments.append((align[0],align[1:],tid,qid)) alignments.sort() minidx=min(len(alignments),maxnum) alignments=alignments[-minidx:] alignments.reverse() return alignments #----------------------------# def run(self): ''' run(self) ''' alph,mat=self.alphabet,self.mat for qid in self.queries.getNames(): query=self.queries.seqs[qid] qidx=self.preprocessQuery(query) alignments=[] for tid in self.db.getNames(): target=self.db.seqs[tid] # print "process",qid,"against",tid alignments.extend(self.scanTarget(target,tid,query,qid,qidx)) alignments.sort() minidx=min(len(alignments),self.numalign) alignments=alignments[-minidx:] alignments.reverse() print "\n\n\n\n\nthe best ",self.numalign,"alignments for query",qid for k in alignments: score,((bq,eq),(bt,et)),tname,qname=k # print k print "score",score,"target",tname print " ",bq+1," "*(eq-bq),eq print "query ",query[bq:eq] print " ",bt+1," "*(et-bt),et print "target",self.db.seqs[tname][bt:et] print "\n" #------------------------------------------# # end class # def getmat(filename): ''' getmat(filename) return alphabet,mat mat is a dictionary a,a => values ''' import sys import string try: lines=open(filename,'r').readlines() except: sys.stderr.write('cannot open file '+filename+'\n') sys.exit() mat={} alphabet=lines[0].split() for line in lines[1:]: v=line.split() for i in range(len(alphabet)): mat[(v[0],alphabet[i])]=string.atoi(v[i+1]) return alphabet,mat #------------------------#