optimized heuristic
authorraetsch <raetsch@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 12:07:09 +0000 (12:07 +0000)
committerraetsch <raetsch@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 12:07:09 +0000 (12:07 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8671 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/PipelineHeuristic.py
scripts/compile_dataset.py

index 5d4c392..c593762 100644 (file)
@@ -80,11 +80,14 @@ class PipelineHeuristic:
       self.mmatrix = mmatrix
       self.qualityPlifs = qualityPlifs
 
-      # when we look for alternative alignments with introns this value is the
-      # mean of intron size
-      self.intron_size  = 250
-
       self.read_size    = 36
+
+      # parameters of the heuristics to decide whether the read is spliced
+      self.splice_thresh      = 0.005
+      self.max_intron_size    = 2000
+      self.max_mismatch       = 2 
+      self.splice_stop_thresh = 0.99
+      self.spliced_bias       = 0.0
    
       self.original_reads = {}
 
@@ -288,7 +291,7 @@ class PipelineHeuristic:
               return 0
 
       proximal_acc   = []
-      for idx in xrange(max_intron_size, max_intron_size+read_size):
+      for idx in xrange(max_intron_size, max_intron_size+read_size/2):
           if currentAcc[idx]>= splice_thresh:
             proximal_acc.append((idx,currentAcc[idx]))
 
@@ -305,7 +308,7 @@ class PipelineHeuristic:
 
 
       proximal_don   = []
-      for idx in xrange(max_intron_size, max_intron_size+read_size):
+      for idx in xrange(max_intron_size+read_size/2, max_intron_size+read_size):
          if currentDon[idx] >= splice_thresh:
             proximal_don.append((idx, currentDon[idx]))
 
@@ -330,9 +333,6 @@ class PipelineHeuristic:
       """
 
       run = self.run
-      splice_thresh = 0.1
-      max_intron_size = 2000
-      bias = 0.0
 
       id       = location['id']
       chr      = location['chr']
@@ -345,22 +345,17 @@ class PipelineHeuristic:
       est = unbracket_est(original_est)
       effective_len = len(est)
 
-      genomicSeq_start  = pos - max_intron_size
-      genomicSeq_stop   = pos + max_intron_size + len(est)
+      genomicSeq_start  = pos - self.max_intron_size
+      genomicSeq_stop   = pos + self.max_intron_size + len(est)
 
       start = cpu()
-      currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+      currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
       stop = cpu()
       self.get_time += stop-start
       dna   = currentDNASeq
 
-      proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc,currentDon, dna, max_intron_size, len(est), splice_thresh)
+      proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)
        
-      #print proximal_acc
-      #print proximal_don
-      #print distal_acc
-      #print distal_don
-      
       alternativeScores = []
       
       # inlined
@@ -370,12 +365,14 @@ class PipelineHeuristic:
       mmatrix = self.mmatrix
       qualityPlifs = self.qualityPlifs
       # inlined
-      
+
+      # find an intron on the 3' end
       _start = cpu()
       for (don_pos,don_score) in proximal_don:
+          DonorScore = calculatePlif(d, [don_score])[0]
+
           for (acc_pos,acc_score,acc_dna) in distal_acc:
 
-              DonorScore = calculatePlif(d, [don_score])[0]
               IntronScore = calculatePlif(h, [acc_pos-don_pos])[0] 
               AcceptorScore = calculatePlif(a, [acc_score])[0] 
           
@@ -385,21 +382,26 @@ class PipelineHeuristic:
               original_est_cut=''
               
               est_ptr=0
-              dna_ptr=max_intron_size
+              dna_ptr=self.max_intron_size
               ptr=0
-              acc_dna_ptr=0 
+              acc_dna_ptr=0
+              num_mismatch = 0
+              
               while ptr<len(original_est):
-             
+                  #print acc_dna_ptr,len(acc_dna),acc_pos,don_pos
+                  
                   if original_est[ptr]=='[':
                       dnaletter=original_est[ptr+1]
                       estletter=original_est[ptr+2]
                       if dna_ptr < don_pos:
-                          original_est_cut+=original_est[ptr:ptr+4] 
+                          original_est_cut+=original_est[ptr:ptr+4]
+                          num_mismatch += 1
                       else:
                           if acc_dna[acc_dna_ptr]==estletter:
                               original_est_cut += estletter # EST letter
                           else:
                               original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
+                              num_mismatch += 1
                               #print '['+acc_dna[acc_dna_ptr]+estletter+']' 
                           acc_dna_ptr+=1 
                       ptr+=4 
@@ -413,6 +415,7 @@ class PipelineHeuristic:
                           if acc_dna[acc_dna_ptr]==estletter:
                               original_est_cut += estletter # EST letter
                           else:
+                              num_mismatch += 1
                               original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
                               #print '('+acc_dna[acc_dna_ptr]+estletter+')' 
                           acc_dna_ptr+=1 
@@ -426,6 +429,8 @@ class PipelineHeuristic:
                   else:
                       dna_ptr+=1 
                       est_ptr+=1
+              if num_mismatch>self.max_mismatch:
+                  continue
                          
               assert(dna_ptr<=len(dna))
               assert(est_ptr<=len(est))
@@ -433,21 +438,26 @@ class PipelineHeuristic:
               #print original_est, original_est_cut              
 
               score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
-              score += AcceptorScore + IntronScore + DonorScore + bias 
+              score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias 
          
               alternativeScores.append(score)
 
+              if acc_score>=self.splice_stop_thresh:
+                  break
+              
       _stop = cpu()
       self.alternativeScoresTime += _stop-_start
 
-      # acceptor
+      # find an intron on the 5' end 
       _start = cpu()
       for (acc_pos,acc_score) in proximal_acc:
+
+          AcceptorScore = calculatePlif(a, [acc_score])[0]
+          
           for (don_pos,don_score,don_dna) in distal_don:
 
               DonorScore = calculatePlif(d, [don_score])[0]
               IntronScore = calculatePlif(h, [acc_pos-don_pos])[0] 
-              AcceptorScore = calculatePlif(a, [acc_score])[0] 
           
               #print 'acc splice: ', (don_pos,don_score,don_dna), (acc_pos,acc_score), (DonorScore,IntronScore,AcceptorScore)
 
@@ -455,9 +465,10 @@ class PipelineHeuristic:
               original_est_cut=''
               
               est_ptr=0
-              dna_ptr=max_intron_size
+              dna_ptr=self.max_intron_size
               ptr=0
-              don_dna_ptr=len(don_dna)-(acc_pos-max_intron_size)-1
+              num_mismatch = 0
+              don_dna_ptr=len(don_dna)-(acc_pos-self.max_intron_size)-1
               while ptr<len(original_est):
                   
                   if original_est[ptr]=='[':
@@ -465,11 +476,13 @@ class PipelineHeuristic:
                       estletter=original_est[ptr+2]
                       if dna_ptr > acc_pos:
                           original_est_cut+=original_est[ptr:ptr+4] 
+                          num_mismatch += 1
                       else:
                           if don_dna[don_dna_ptr]==estletter:
                               original_est_cut += estletter # EST letter
                           else:
                               original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
+                              num_mismatch += 1
                               #print '['+don_dna[don_dna_ptr]+estletter+']' 
                           don_dna_ptr+=1 
                       ptr+=4 
@@ -484,6 +497,7 @@ class PipelineHeuristic:
                               original_est_cut += estletter # EST letter
                           else:
                               original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
+                              num_mismatch += 1
                               #print '('+don_dna[don_dna_ptr]+estletter+')' 
                           don_dna_ptr+=1 
                           
@@ -497,16 +511,21 @@ class PipelineHeuristic:
                       dna_ptr+=1 
                       est_ptr+=1
                          
+              if num_mismatch>self.max_mismatch:
+                  continue
               assert(dna_ptr<=len(dna))
               assert(est_ptr<=len(est))
 
               #print original_est, original_est_cut              
 
               score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
-              score += AcceptorScore + IntronScore + DonorScore + bias
+              score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias
          
               alternativeScores.append(score)
 
+              if don_score>=self.splice_stop_thresh:
+                  break
+              
       _stop = cpu()
       self.alternativeScoresTime += _stop-_start
 
index 3b56946..472664d 100644 (file)
@@ -7,6 +7,7 @@ import os
 import re
 import pdb
 import cPickle
+#import resource
 
 import numpy
 from numpy.matlib import mat,zeros,ones,inf
@@ -66,6 +67,11 @@ info = """
    - 'remappped_pos' - the position of the remapped read
    """
 
+
+#def cpu():
+#   return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
+#   resource.getrusage(resource.RUSAGE_SELF).ru_stime) 
+
 # some useful constants
 
 extended_alphabet = ['-','a','c','g','t','n','[',']']
@@ -466,10 +472,13 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
    returns then the genomic sequence of this interval and the associated scores.
    """
 
+   #start = cpu()
+   
    chrom         = 'chr%d' % chr
    genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
    genomicSeq = genomicSeq.lower()
 
+
    # check the obtained dna sequence
    assert genomicSeq != '', 'load_genomic returned empty sequence!'
    #for elem in genomicSeq:
@@ -478,12 +487,22 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
    no_base = re.compile('[^acgt]')
    genomicSeq = no_base.sub('n',genomicSeq)
 
+   #stop = cpu()
+   #load_genomic_time = stop-start
+
+   #start = cpu()
+
    intervalBegin  = genomicSeq_start-100
    intervalEnd    = genomicSeq_stop+100
    seq_pos_offset = genomicSeq_start
 
    currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
 
+   #stop = cpu()
+   #getSpliceScores_time = stop-start
+
+   #start = cpu()
+   
    currentAcc = currentAcc[100:-98]
    currentAcc = currentAcc[1:]
    currentDon = currentDon[100:-100]
@@ -500,6 +519,11 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
    assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
    assert len(currentAcc) == len(currentDon)
 
+   #stop = cpu()
+   #rest_time = stop-start
+
+   #print "time\t%f\t%f\t%f\n" % (load_genomic_time, getSpliceScores_time, rest_time)
+   
    return genomicSeq, currentAcc, currentDon