git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8653 e1793c9e...
[qpalma.git] / scripts / PipelineHeuristic.py
index 0d128c8..4ea1cb9 100644 (file)
@@ -260,29 +260,50 @@ class PipelineHeuristic:
       print 'True neg / false neg : %d %d'%(self.true_neg,self.false_neg)
 
 
-   def findHighestScoringSpliceSites(self,currentAcc,currentDon):
+   def findHighestScoringSpliceSites(self, currentAcc, currentDon, DNA, max_intron_size, read_size, splice_thresh):
 
-      acc   = []
-      for idx,score in enumerate(currentAcc):
-         if score > -inf:
-            acc.append((idx,score))
-         if idx>self.read_size:
-             break 
+      def signum(a):
+          if a>0: 
+              return 1
+         elif a<0:
+              return -1
+          else:
+              return 0
 
-      acc.sort(lambda x,y: x[0]-y[0]) 
-      acc=acc[-2:]
-      
-      don   = []
-      for idx,score in enumerate(currentDon):
-         if score > -inf:
-            don.append((idx,score))
-         if idx>self.read_size:
-             break 
+      proximal_acc   = []
+      for idx in xrange(max_intron_size, max_intron_size+read_size):
+          if currentAcc[idx]>= splice_thresh:
+            proximal_acc.append((idx,currentAcc[idx]))
+
+      proximal_acc.sort(lambda x,y: signum(x[1]-y[1])) 
+      proximal_acc=proximal_acc[-2:]
+
+      distal_acc   = []
+      for idx in xrange(max_intron_size+read_size, len(currentAcc)):
+          if currentAcc[idx]>= splice_thresh:
+            distal_acc.append((idx, currentAcc[idx], DNA[idx+1:idx+read_size]))
+
+      #distal_acc.sort(lambda x,y: signum(x[1]-y[1])) 
+      #distal_acc=distal_acc[-2:]
+
+
+      proximal_don   = []
+      for idx in xrange(max_intron_size, max_intron_size+read_size):
+         if currentDon[idx] >= splice_thresh:
+            proximal_don.append((idx, currentDon[idx]))
 
-      don.sort(lambda x,y: x[0]-y[0]) 
-      don=don[-2:]
+      proximal_don.sort(lambda x,y: signum(x[1]-y[1]))
+      proximal_don=proximal_don[-2:]
 
-      return don,acc
+      distal_don   = []
+      for idx in xrange(1, max_intron_size):
+         if currentDon[idx] >= splice_thresh:
+            distal_don.append((idx, currentDon[idx], DNA[idx-read_size:idx]))
+
+      #distal_don.sort(lambda x,y: signum(x[1]-y[1]))
+      #distal_don=distal_don[-2:]
+
+      return proximal_acc,proximal_don,distal_acc,distal_don
 
    def calcAlternativeAlignments(self,location):
       """
@@ -290,15 +311,10 @@ class PipelineHeuristic:
       alternative alignments taking into account for example matched
       donor/acceptor positions.
       """
-      def signum(a):
-         if a>0: 
-            return 1
-         elif a<0:
-            return -1
-         else:
-            return 0
 
       run = self.run
+      splice_thresh = 0.01
+      max_intron_size = 2000 
 
       id       = location['id']
       chr      = location['chr']
@@ -311,8 +327,8 @@ class PipelineHeuristic:
       est = unbracket_est(original_est)
       effective_len = len(est)
 
-      genomicSeq_start  = pos
-      genomicSeq_stop   = pos+1000#self.read_size+1#+self.intron_size*2+self.read_size*2
+      genomicSeq_start  = pos - max_intron_size
+      genomicSeq_stop   = pos + max_intron_size + len(est)
 
       start = cpu()
       currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
@@ -320,15 +336,12 @@ class PipelineHeuristic:
       self.get_time += stop-start
       dna   = currentDNASeq
 
-      alt_don,alt_acc = self.findHighestScoringSpliceSites(currentAcc,currentDon)
-
-      sort_acc=[ elem for elem in enumerate(currentAcc) ] 
-      sort_acc.sort(lambda x,y: signum(x[1]-y[1]) )
-      
-      (max_acc_pos,max_acc_score)=sort_acc[-1]
-      
-      #print alt_don
-      #print alt_acc
+      proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc,currentDon, dna, max_intron_size, len(est), splice_thresh)
+       
+      print proximal_acc
+      print proximal_don
+      print distal_acc
+      print distal_don
       
       alternativeScores = []
       
@@ -348,7 +361,7 @@ class PipelineHeuristic:
       dummyDonorScore = calculatePlif(d, [0.25])[0]
       
       _start = cpu()
-      for (don_pos,don_score) in alt_don:
+      for (don_pos,don_score) in proximal_don:
          # remove mismatching positions in the second exon
          original_est_cut=''
 
@@ -363,7 +376,8 @@ class PipelineHeuristic:
                 if dna_ptr < don_pos:
                     original_est_cut+=original_est[ptr:ptr+4] 
                 else:
-                    original_est_cut+=estletter # EST letter
+                    #original_est_cut+=estletter # EST letter
+                    original_est_cut+=dnaletter # DNA letter
                 ptr+=4 
             else:
                 dnaletter=original_est[ptr]