heuristic seems to work well
authorraetsch <raetsch@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 10:25:40 +0000 (10:25 +0000)
committerraetsch <raetsch@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 10:25:40 +0000 (10:25 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8668 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/PipelineHeuristic.py

index 66c13e3..8fee495 100644 (file)
@@ -18,7 +18,7 @@ from qpalma.compute_donacc import *
 from qpalma.TrainingParam import Param
 from qpalma.Plif import Plf
 
-from qpalma.tools.splicesites import getDonAccScores
+#from qpalma.tools.splicesites import getDonAccScores
 from qpalma.Configuration import *
 
 from compile_dataset import getSpliceScores, get_seq_and_scores
@@ -214,10 +214,13 @@ class PipelineHeuristic:
 
             start = cpu()
             # found no alternatives
-            if alternativeAlignmentScores == []:
-               continue
+            #if alternativeAlignmentScores == []:
+            #   continue
             
-            maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
+            if alternativeAlignmentScores == []:
+                maxAlternativeAlignmentScore = -inf
+            else:
+                maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
             #print 'vMatchScore/alternativeScore: %f %f ' % (vMatchScore,maxAlternativeAlignmentScore)
             #print 'all candidates %s' % str(alternativeAlignmentScores)
 
@@ -280,7 +283,7 @@ class PipelineHeuristic:
 
       distal_acc   = []
       for idx in xrange(max_intron_size+read_size, len(currentAcc)):
-          if currentAcc[idx]>= splice_thresh:
+          if currentAcc[idx]>= splice_thresh and idx+read_size<len(currentAcc):
             distal_acc.append((idx, currentAcc[idx], DNA[idx+1:idx+read_size]))
 
       #distal_acc.sort(lambda x,y: signum(x[1]-y[1])) 
@@ -297,10 +300,10 @@ class PipelineHeuristic:
 
       distal_don   = []
       for idx in xrange(1, max_intron_size):
-         if currentDon[idx] >= splice_thresh:
+         if currentDon[idx] >= splice_thresh and idx>read_size:
             distal_don.append((idx, currentDon[idx], DNA[idx-read_size:idx]))
 
-      #distal_don.sort(lambda x,y: signum(x[1]-y[1]))
+      distal_don.sort(lambda x,y: y[0]-x[0])
       #distal_don=distal_don[-2:]
 
       return proximal_acc,proximal_don,distal_acc,distal_don
@@ -314,7 +317,8 @@ class PipelineHeuristic:
 
       run = self.run
       splice_thresh = 0.1
-      max_intron_size = 2000 
+      max_intron_size = 2000
+      bias = 0.0
 
       id       = location['id']
       chr      = location['chr']
@@ -338,10 +342,10 @@ class PipelineHeuristic:
 
       proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc,currentDon, dna, max_intron_size, len(est), splice_thresh)
        
-      print proximal_acc
-      print proximal_don
-      print distal_acc
-      print distal_don
+      #print proximal_acc
+      #print proximal_don
+      #print distal_acc
+      #print distal_don
       
       alternativeScores = []
       
@@ -352,21 +356,16 @@ class PipelineHeuristic:
       mmatrix = self.mmatrix
       qualityPlifs = self.qualityPlifs
       # inlined
-
-      # compute dummy scores
-      #IntronScore = calculatePlif(h, [math.fabs(max_acc_pos-30)])[0]
-      #dummyAcceptorScore = calculatePlif(a, [max_acc_score])[0] 
       
       _start = cpu()
       for (don_pos,don_score) in proximal_don:
-
           for (acc_pos,acc_score,acc_dna) in distal_acc:
 
               DonorScore = calculatePlif(d, [don_score])[0]
               IntronScore = calculatePlif(h, [acc_pos-don_pos])[0] 
               AcceptorScore = calculatePlif(a, [acc_score])[0] 
           
-              print 'splice: ', (don_pos,don_score), (acc_pos,acc_score,acc_dna), (DonorScore,IntronScore,AcceptorScore)
+              #print 'don splice: ', (don_pos,don_score), (acc_pos,acc_score,acc_dna), (DonorScore,IntronScore,AcceptorScore)
 
               # construct a new "original_est"
               original_est_cut=''
@@ -387,14 +386,23 @@ class PipelineHeuristic:
                               original_est_cut += estletter # EST letter
                           else:
                               original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
-                              print '['+acc_dna[acc_dna_ptr]+estletter+']' 
+                              #print '['+acc_dna[acc_dna_ptr]+estletter+']' 
                           acc_dna_ptr+=1 
                       ptr+=4 
                   else:
                       dnaletter=original_est[ptr]
                       estletter=dnaletter
                 
-                      original_est_cut+=estletter # EST letter
+                      if dna_ptr < don_pos:
+                          original_est_cut+=estletter # EST letter
+                      else:
+                          if acc_dna[acc_dna_ptr]==estletter:
+                              original_est_cut += estletter # EST letter
+                          else:
+                              original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
+                              #print '('+acc_dna[acc_dna_ptr]+estletter+')' 
+                          acc_dna_ptr+=1 
+                          
                       ptr+=1
 
                   if estletter=='-':
@@ -408,67 +416,82 @@ class PipelineHeuristic:
               assert(dna_ptr<=len(dna))
               assert(est_ptr<=len(est))
 
-              print original_est, original_est_cut              
+              #print original_est, original_est_cut              
 
               score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
-              score += AcceptorScore + IntronScore + DonorScore
+              score += AcceptorScore + IntronScore + DonorScore + bias 
          
               alternativeScores.append(score)
 
       _stop = cpu()
       self.alternativeScoresTime += _stop-_start
 
+      # acceptor
       _start = cpu()
-      for (acc_pos,acc_score) in alt_acc:
-         # remove mismatching positions in the second exon
-         original_est_cut=''
-
-         est_ptr=0
-         dna_ptr=0
-         ptr=0 
-         while ptr<len(original_est):
-             
-            if original_est[ptr]=='[':
-                dnaletter=original_est[ptr+1]
-                estletter=original_est[ptr+2]
-                if est_ptr>=acc_pos:
-                    original_est_cut+=original_est[ptr:ptr+4] 
-                else:
-                    original_est_cut+=estletter # EST letter
-                ptr+=4 
-            else:
-                dnaletter=original_est[ptr]
-                estletter=dnaletter
+      for (acc_pos,acc_score) in proximal_acc:
+          for (don_pos,don_score,don_dna) in distal_don:
+
+              DonorScore = calculatePlif(d, [don_score])[0]
+              IntronScore = calculatePlif(h, [acc_pos-don_pos])[0] 
+              AcceptorScore = calculatePlif(a, [acc_score])[0] 
+          
+              #print 'acc splice: ', (don_pos,don_score,don_dna), (acc_pos,acc_score), (DonorScore,IntronScore,AcceptorScore)
+
+              # construct a new "original_est"
+              original_est_cut=''
+              
+              est_ptr=0
+              dna_ptr=max_intron_size
+              ptr=0
+              don_dna_ptr=len(don_dna)-(acc_pos-max_intron_size)-1
+              while ptr<len(original_est):
+                  
+                  if original_est[ptr]=='[':
+                      dnaletter=original_est[ptr+1]
+                      estletter=original_est[ptr+2]
+                      if dna_ptr > acc_pos:
+                          original_est_cut+=original_est[ptr:ptr+4] 
+                      else:
+                          if don_dna[don_dna_ptr]==estletter:
+                              original_est_cut += estletter # EST letter
+                          else:
+                              original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
+                              #print '['+don_dna[don_dna_ptr]+estletter+']' 
+                          don_dna_ptr+=1 
+                      ptr+=4 
+                  else:
+                      dnaletter=original_est[ptr]
+                      estletter=dnaletter
                 
-                original_est_cut+=estletter # EST letter
-                ptr+=1
+                      if dna_ptr > acc_pos:
+                          original_est_cut+=estletter # EST letter
+                      else:
+                          if don_dna[don_dna_ptr]==estletter:
+                              original_est_cut += estletter # EST letter
+                          else:
+                              original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
+                              #print '('+don_dna[don_dna_ptr]+estletter+')' 
+                          don_dna_ptr+=1 
+                          
+                      ptr+=1
 
-            if estletter=='-':
-                dna_ptr+=1 
-            elif dnaletter=='-':
-                est_ptr+=1
-            else:
-                dna_ptr+=1 
-                est_ptr+=1
+                  if estletter=='-':
+                      dna_ptr+=1 
+                  elif dnaletter=='-':
+                      est_ptr+=1
+                  else:
+                      dna_ptr+=1 
+                      est_ptr+=1
                          
-         assert(dna_ptr<=len(dna))
-         assert(est_ptr<=len(est))
+              assert(dna_ptr<=len(dna))
+              assert(est_ptr<=len(est))
 
-         #print original_est,original_est_cut
-         
-         AcceptorScore = calculatePlif(d, [acc_score])[0]
-         #print "Acceptor"
-         #print AcceptorScore,acc_score,acc_pos
+              #print original_est, original_est_cut              
 
-         #if acc_score<0.1:
-         #    print currentAcc[0:50]
-         #    print currentDon[0:50]
-         
-         score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
-         score += AcceptorScore + IntronScore + dummyDonorScore
+              score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
+              score += AcceptorScore + IntronScore + DonorScore + bias
          
-         #print 'diff %f,%f,%f' % ((trueWeight.T * self.currentPhi)[0,0] - score,(trueWeight.T * self.currentPhi)[0,0], score)
-         alternativeScores.append(score)
+              alternativeScores.append(score)
 
       _stop = cpu()
       self.alternativeScoresTime += _stop-_start