git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8633 e1793c9e...
authorraetsch <raetsch@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 17 Apr 2008 21:22:59 +0000 (21:22 +0000)
committerraetsch <raetsch@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 17 Apr 2008 21:22:59 +0000 (21:22 +0000)
scripts/PipelineHeuristic.py

index ad46368..16f14cd 100644 (file)
@@ -298,15 +298,12 @@ class PipelineHeuristic:
       chr      = location['chr']
       pos      = location['pos']
       strand   = location['strand']
-      seq      = location['seq']
-      #orig_seq = location['orig_seq']
-      prb      = location['prb']
+      original_est = location['seq']
+      quality      = location['prb']
       cal_prb  = location['cal_prb']
       
-      orig_seq = self.original_reads[int(id)]
-
-      unb_seq = unbracket_est(seq)
-      effective_len = len(unb_seq)
+      est = unbracket_est(seq)
+      effective_len = len(est)
 
       genomicSeq_start  = pos
       genomicSeq_stop   = pos+self.intron_size*2+self.read_size*2
@@ -316,20 +313,11 @@ class PipelineHeuristic:
       stop = cpu()
       self.get_time += stop-start
 
-      dna   = currentDNASeq
+      #dna   = currentDNASeq
 
-      start = cpu()
       alt_don,alt_acc = self.findHighestScoringSpliceSites(currentAcc,currentDon)
-      stop = cpu()
-      self.splice_site_time = stop-start
-
       alternativeScores = []
-
-      exons          = zeros((2,2),dtype=numpy.int)
-      est            = unb_seq
-      original_est   = seq
-      quality        = prb
-
+      
       # inlined
       h = self.h
       d = self.d
@@ -338,79 +326,61 @@ class PipelineHeuristic:
       qualityPlifs = self.qualityPlifs
       # inlined
 
+      # compute dummy scores
       IntronScore = calculatePlif(h, [self.intron_size+1])[0]
       dummyAcceptorScore = calculatePlif(a, [0.25])[0] 
       dummyDonorScore = calculatePlif(d, [0.25])[0]
       
       _start = cpu()
       for (don_pos,don_score) in alt_don:
-         """
-         start = cpu()
-
-         acc_pos = don_pos + self.intron_size
-         
-         exons[0,0]     = 0
-         exons[0,1]     = don_pos
-         exons[1,0]     = acc_pos+1
-         exons[1,1]     = acc_pos+1+(self.read_size-don_pos)
-
-         _dna = dna[:int(exons[1,1])]
-         _dna = _dna[:exons[1,0]] + est[don_pos:]# only correct if there are no indels!!!
-
-         _currentAcc = currentAcc[:int(exons[1,1])]
-         _currentAcc = [0.25]*len(_currentAcc) 
-
-         _currentDon = currentDon[:int(exons[1,1])]
-         #_currentDon = [0.25]*len(_currentDon)
-
-         currentVMatchAlignment = _dna, exons, est, original_est, quality,\
-         _currentAcc, _currentDon
-
-         stop = cpu()
-         self.array_stuff += stop - start
-
-         #alternativeScore = self.calcAlignmentScore(currentVMatchAlignment)
-         #alternativeScores.append(self.calcAlignmentScore(currentVMatchAlignment))
-
-         # Lets start calculation
-         dna, exons, est, original_est, quality, acc_supp, don_supp =\
-         currentVMatchAlignment
-
-         # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
-         trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
-         computeSpliceAlignWithQuality(dna, exons, est, original_est,\
-         quality, qualityPlifs, run)
+         # remove mismatching positions in the second exon
+         original_est_cut=''
 
-         stop = cpu()
-         self.computeSpliceAlignWithQualityTime += stop-start
-         start = cpu()
+         est_ptr=0
+         dna_ptr=0
+         ptr=0 
+         while ptr<len(original_est):
+             
+            if original_est[ptr]=='[':
+                dnaletter=original_est[ptr+1]
+                estletter=original_est[ptr+2]
+                if dna_ptr < don_pos:
+                    original_est_cut+=original_est[ptr:ptr+4] 
+                else:
+                    original_est_cut+=estletter # EST letter
+                ptr+=4 
+            else:
+                dnaletter=original_est[ptr]
+                estletter=dnaletter
+                
+                original_est_cut+=estletter # EST letter
+                ptr+=1
 
-         # Calculate the weights
-         trueWeightDon, trueWeightAcc, trueWeightIntron =\
-         computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+            if estletter=='-':
+                dna_ptr+=1 
+            elif dnaletter=='-':
+                est_ptr+=1
+            else:
+                dna_ptr+=1 
+                est_ptr+=1
+                         
+         assert(dna_ptr<=len(dna))
+         assert(est_ptr<=len(est))
 
-         #for i in xrange(0,len(trueWeightDon)):
-         #    trueWeightDon[i]=0.0
-         #for i in xrange(0,len(trueWeightAcc)):
-         #    trueWeightAcc[i]=0.0
-         #for i in xrange(0,len(trueWeightIntron)):
-         #    trueWeightIntron[i]=0.0
+         DonorScore = calculatePlif(d, [don_score])[0]
+         #print DonorScore,don_score
          
-         stop = cpu()
-         self.computeSpliceWeightsTime += stop-start
-
-         start = cpu()
-
-         trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
-         stop = cpu()
-         self.DotProdTime += stop-start
+         score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
+         score += dummyAcceptorScore + IntronScore + DonorScore
+         
+         #print 'diff %f,%f,%f' % ((trueWeight.T * self.currentPhi)[0,0] - score,(trueWeight.T * self.currentPhi)[0,0], score)
+         alternativeScores.append(score)
 
-         # Calculate w'phi(x,y) the total score of the alignment
-         alternativeScores.append((trueWeight.T * self.currentPhi)[0,0])
-         """
+      _stop = cpu()
+      self.alternativeScoresTime += _stop-_start
 
-         # new score computation
-         
+      _start = cpu()
+      for (acc_pos,acc_score) in alt_acc:
          # remove mismatching positions in the second exon
          original_est_cut=''
 
@@ -422,7 +392,7 @@ class PipelineHeuristic:
             if original_est[ptr]=='[':
                 dnaletter=original_est[ptr+1]
                 estletter=original_est[ptr+2]
-                if est_ptr<=exons[0,1]:
+                if est_ptr>=acc_pos:
                     original_est_cut+=original_est[ptr:ptr+4] 
                 else:
                     original_est_cut+=estletter # EST letter
@@ -445,11 +415,13 @@ class PipelineHeuristic:
          assert(dna_ptr<=len(dna))
          assert(est_ptr<=len(est))
 
-         DonorScore = calculatePlif(d, [don_score])[0]
-         print DonorScore,don_score
+         print original_est,original_est_cut
+         
+         AcceptorScore = calculatePlif(d, [acc_score])[0]
+         print AcceptorScore,acc_score
          
          score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
-         score += dummyAcceptorScore + IntronScore + DonorScore
+         score += AcceptorScore + IntronScore + dummyDonorScore
          
          #print 'diff %f,%f,%f' % ((trueWeight.T * self.currentPhi)[0,0] - score,(trueWeight.T * self.currentPhi)[0,0], score)
          alternativeScores.append(score)
@@ -497,8 +469,8 @@ if __name__ == '__main__':
    run_fname   = jp(dir,'run_object.pickle')
    #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_flag'
 
-   #data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_2k'
-   data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_100'
+   data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_2k'
+   #data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_100'
 
    param_fname = jp(dir,'param_500.pickle')