+ added some testcases
[qpalma.git] / scripts / PipelineHeuristic.py
index 974fb7d..e6e3a39 100644 (file)
@@ -6,45 +6,50 @@ import sys
 import pdb
 import os
 import os.path
-import math
 import resource
 
-from qpalma.DataProc import *
 from qpalma.computeSpliceWeights import *
 from qpalma.set_param_palma import *
 from qpalma.computeSpliceAlignWithQuality import *
-from qpalma.penalty_lookup_new import *
-from qpalma.compute_donacc import *
-from qpalma.TrainingParam import Param
-from qpalma.Plif import Plf
-
-from qpalma.tools.splicesites import getDonAccScores
-from qpalma.Configuration import *
-
-from compile_dataset import getSpliceScores, get_seq_and_scores
 
 from numpy.matlib import mat,zeros,ones,inf
 from numpy import inf,mean
 
-from qpalma.parsers import PipelineReadParser
+from qpalma.parsers import *
+
+from ParaParser import *
+
+from qpalma.Lookup import LookupTable
+
+from qpalma.sequence_utils import reverse_complement,unbracket_seq
 
 
-def unbracket_est(est):
-   new_est = ''
-   e = 0
+class BracketWrapper:
+   #fields = ['id', 'chr', 'pos', 'strand', 'mismatches', 'length',\
+   #'offset', 'seq', 'prb', 'cal_prb', 'chastity']
+   fields = ['id', 'chr', 'pos', 'strand', 'seq', 'prb']
 
-   while True:
-      if e >= len(est):
-         break
+   def __init__(self,filename):
+      self.parser = ParaParser("%lu%d%d%s%s%s",self.fields,len(self.fields),IN_VECTOR)
+      self.parser.parseFile(filename)
 
-      if est[e] == '[':
-         new_est += est[e+2]
-         e += 4
-      else:
-         new_est += est[e]
-         e += 1
+   def __len__(self):
+      return self.parser.getSize(IN_VECTOR)
+      
+   def __getitem__(self,key):
+      return self.parser.fetchEntry(key)
+
+   def __iter__(self):
+      self.counter = 0
+      self.size = self.parser.getSize(IN_VECTOR)
+      return self
 
-   return "".join(new_est).lower()
+   def next(self):
+      if not self.counter < self.size:
+         raise StopIteration
+      return_val = self.parser.fetchEntry(self.counter)
+      self.counter += 1
+      return return_val
 
 
 class PipelineHeuristic:
@@ -53,7 +58,7 @@ class PipelineHeuristic:
    vmatch is spliced an should be then newly aligned using QPalma or not.
    """
 
-   def __init__(self,run_fname,data_fname,param_fname):
+   def __init__(self,run_fname,data_fname,param_fname,result_fname):
       """
       We need a run object holding information about the nr. of support points
       etc.
@@ -62,6 +67,41 @@ class PipelineHeuristic:
       run = cPickle.load(open(run_fname))
       self.run = run
 
+      start = cpu()
+      # old version
+      #self.all_remapped_reads = parse_map_vm_heuristic(data_fname)
+      self.all_remapped_reads = BracketWrapper(data_fname)
+      stop = cpu()
+
+      print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
+
+      #g_dir    = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
+      #acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
+      #don_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
+
+      #g_fmt = 'contig%d.dna.flat'
+      #s_fmt = 'contig_%d%s'
+
+      #self.chromo_range = range(0,1099)
+
+      g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+      g_fmt = 'chr%d.dna.flat'
+      s_fmt = 'contig_%d%s'
+
+      self.chromo_range = range(1,6)
+
+      start = cpu()
+      self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,self.chromo_range)
+      stop = cpu()
+
+      print 'prefetched sequence and splice data in %f sec' % (stop-start)
+
+      self.result_spliced_fh = open('%s.spliced'%result_fname,'w+')
+      self.result_unspliced_fh = open('%s.unspliced'%result_fname,'w+')
+
       start = cpu()
 
       self.data_fname = data_fname
@@ -77,19 +117,42 @@ class PipelineHeuristic:
       self.mmatrix = mmatrix
       self.qualityPlifs = qualityPlifs
 
-      # when we look for alternative alignments with introns this value is the
-      # mean of intron size
-      self.intron_size  = 150
-
-      self.read_size    = 36
+      self.read_size    = 38
+
+      # parameters of the heuristics to decide whether the read is spliced
+      #self.splice_thresh      = 0.005
+      self.splice_thresh      = 0.01
+      self.max_intron_size    = 2000
+      self.max_mismatch       = 2 
+      #self.splice_stop_thresh = 0.99
+      self.splice_stop_thresh = 0.8
+      self.spliced_bias       = 0.0
+
+      param_lines = [\
+      ('%f','splice_thresh',self.splice_thresh),\
+      ('%d','max_intron_size',self.max_intron_size),\
+      ('%d','max_mismatch',self.max_mismatch),\
+      ('%f','splice_stop_thresh',self.splice_stop_thresh),\
+      ('%f','spliced_bias',self.spliced_bias)]
+
+      param_lines = [('# %s: '+form+'\n')%(name,val) for form,name,val in param_lines]
+      param_lines.append('# data: %s\n'%self.data_fname)
+      param_lines.append('# param: %s\n'%param_fname)
+
+      #pdb.set_trace()
+
+      for p_line in param_lines:
+         self.result_spliced_fh.write(p_line)
+         self.result_unspliced_fh.write(p_line)
    
-      self.original_reads = {}
+      #self.original_reads = {}
 
-      for line in open('/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.pipeline'):
-         line = line.strip()
-         id,seq,q1,q2,q3 = line.split()
-         id = int(id)
-         self.original_reads[id] = seq
+      # we do not have this information
+      #for line in open(reads_pipeline_fn):
+      #   line = line.strip()
+      #   id,seq,q1,q2,q3 = line.split()
+      #   id = int(id)
+      #   self.original_reads[id] = seq
 
       lengthSP    = run['numLengthSuppPoints']
       donSP       = run['numDonSuppPoints']
@@ -125,7 +188,6 @@ class PipelineHeuristic:
       self.alternativeScoresTime = 0.0
 
       self.count_time = 0.0
-      self.read_parsing = 0.0
       self.main_loop = 0.0
       self.splice_site_time = 0.0
       self.computeSpliceAlignWithQualityTime = 0.0
@@ -142,14 +204,6 @@ class PipelineHeuristic:
       """
       run = self.run 
 
-      start = cpu()
-
-      rrp = PipelineReadParser(self.data_fname)
-      all_remapped_reads = rrp.parse()
-
-      stop = cpu()
-
-      self.read_parsing = stop-start
 
       ctr = 0
       unspliced_ctr  = 0
@@ -158,99 +212,124 @@ class PipelineHeuristic:
       print 'Starting filtering...'
       _start = cpu()
 
-      for readId,currentReadLocations in all_remapped_reads.items():
-         for location in currentReadLocations[:1]:
+      #for readId,currentReadLocations in all_remapped_reads.items():
+         #for location in currentReadLocations[:1]:
 
-            id       = location['id']
-            chr      = location['chr']
-            pos      = location['pos']
-            strand   = location['strand']
-            mismatch = location['mismatches']
-            length   = location['length']
-            off      = location['offset']
-            seq      = location['seq']
-            prb      = location['prb']
-            cal_prb  = location['cal_prb']
-            chastity = location['chastity']
+      for location,original_line in self.all_remapped_reads:
+         
+         if ctr % 1000 == 0:
+            print ctr
+
+         id       = location['id']
+         chr      = location['chr']
+         pos      = location['pos']
+         strand   = location['strand']
+         #mismatch = location['mismatches']
+         #length   = location['length']
+         #off      = location['offset']
+         seq      = location['seq']
+         prb      = location['prb']
+         #cal_prb  = location['cal_prb']
+         #chastity = location['chastity']
 
-            id = int(id)
+         id = int(id)
 
-            if strand == '-':
-               continue
+         seq = seq.lower()
 
-            if ctr == 100:
-               break
+         strand_map = {'D':'+', 'P':'-'}
 
-            #if pos > 10000000:
-            #   continue
-      
-            unb_seq = unbracket_est(seq)
-            effective_len = len(unb_seq)
+         strand = strand_map[strand]
+
+         if not chr in self.chromo_range:
+            continue
+
+         unb_seq = unbracket_seq(seq)
 
-            genomicSeq_start  = pos
-            genomicSeq_stop   = pos+effective_len-1
+         # forgot to do this
+         if strand == '-':
+            #pos = self.lt1.seqInfo.chromo_sizes[chr]-pos-self.read_size
+            unb_seq = reverse_complement(unb_seq)
 
-            start = cpu()
-            #print genomicSeq_start,genomicSeq_stop
-            currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
-            stop = cpu()
-            self.get_time += stop-start
+         effective_len = len(unb_seq)
 
-            dna            = currentDNASeq
-            exons          = zeros((2,1))
-            exons[0,0]     = 0
-            exons[1,0]     = effective_len
-            est            = unb_seq
-            original_est   = seq
-            quality        = prb
+         genomicSeq_start  = pos
+         #genomicSeq_stop   = pos+effective_len-1
+         genomicSeq_stop   = pos+effective_len
 
-            #pdb.set_trace()
+         start = cpu()
+         #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+         currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
 
-            currentVMatchAlignment = dna, exons, est, original_est, quality,\
-            currentAcc, currentDon
-            vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
+         stop = cpu()
+         self.get_time += stop-start
 
+         dna            = currentDNASeq
+         exons          = zeros((2,1))
+         exons[0,0]     = 0
+         exons[1,0]     = effective_len
+         est            = unb_seq
+         original_est   = seq
+         quality        = map(lambda x:ord(x)-64,prb)
+
+         #pdb.set_trace()
+
+         currentVMatchAlignment = dna, exons, est, original_est, quality,\
+         currentAcc, currentDon
+
+         try:
             alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+         except:
+            alternativeAlignmentScores = []
+
 
-            start = cpu()
-            # found no alternatives
-            if alternativeAlignmentScores == []:
-               continue
-            
-            maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
-            #print 'vMatchScore/alternativeScore: %f %f ' % (vMatchScore,maxAlternativeAlignmentScore)
-            #print 'all candidates %s' % str(alternativeAlignmentScores)
-
-            new_id = id - 1000000300000
-
-            unspliced = False
-            # unspliced 
-            if new_id > 0: 
-               unspliced = True
-
-            # Seems that according to our learned parameters VMatch found a good
-            # alignment of the current read
-            if maxAlternativeAlignmentScore < vMatchScore:
-               unspliced_ctr += 1
-
-               if unspliced:
-                  self.true_neg += 1
-               else:
-                  self.false_neg += 1
-
-            # We found an alternative alignment considering splice sites that scores
-            # higher than the VMatch alignment
+         if alternativeAlignmentScores == []:
+             # no alignment necessary
+             maxAlternativeAlignmentScore = -inf
+             vMatchScore = 0.0 
+         else:
+             maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
+             # compute alignment for vmatch unspliced read
+             vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
+
+         start = cpu()
+         
+         #print 'vMatchScore/alternativeScore: %f %f ' % (vMatchScore,maxAlternativeAlignmentScore)
+         #print 'all candidates %s' % str(alternativeAlignmentScores)
+
+         new_id = id - 1000000300000
+
+         unspliced = False
+         # unspliced 
+         if new_id > 0: 
+            unspliced = True
+
+         # Seems that according to our learned parameters VMatch found a good
+         # alignment of the current read
+         if maxAlternativeAlignmentScore < vMatchScore:
+            unspliced_ctr += 1
+
+            self.result_unspliced_fh.write(original_line+'\n') 
+
+            if unspliced:
+               self.true_neg += 1
             else:
-               spliced_ctr += 1
+               self.false_neg += 1
+
+         # We found an alternative alignment considering splice sites that scores
+         # higher than the VMatch alignment
+         else:
+            spliced_ctr += 1
 
-               if unspliced:
-                  self.false_pos += 1
-               else:
-                  self.true_pos += 1
+            self.result_spliced_fh.write(original_line+'\n')
+
+            if unspliced:
+               self.false_pos += 1
+            else:
+               self.true_pos += 1
 
-            ctr += 1
-            stop = cpu()
-            self.count_time = stop-start
+         ctr += 1
+         stop = cpu()
+         self.count_time = stop-start
 
       _stop = cpu()
       self.main_loop = _stop-_start
@@ -260,30 +339,50 @@ class PipelineHeuristic:
       print 'True neg / false neg : %d %d'%(self.true_neg,self.false_neg)
 
 
-   def findHighestScoringSpliceSites(self,currentAcc,currentDon):
+   def findHighestScoringSpliceSites(self, currentAcc, currentDon, DNA, max_intron_size, read_size, splice_thresh):
 
-      acc   = []
-      for idx,score in enumerate(currentAcc):
-         if score > -inf:
-            acc.append((idx,score))
-         if idx>self.read_size:
-             break 
+      def signum(a):
+          if a>0: 
+              return 1
+         elif a<0:
+              return -1
+          else:
+              return 0
+
+      proximal_acc   = []
+      for idx in xrange(max_intron_size, max_intron_size+read_size/2):
+          if currentAcc[idx]>= splice_thresh:
+            proximal_acc.append((idx,currentAcc[idx]))
+
+      proximal_acc.sort(lambda x,y: signum(x[1]-y[1])) 
+      proximal_acc=proximal_acc[-2:]
+
+      distal_acc   = []
+      for idx in xrange(max_intron_size+read_size, len(currentAcc)):
+          if currentAcc[idx]>= splice_thresh and idx+read_size<len(currentAcc):
+            distal_acc.append((idx, currentAcc[idx], DNA[idx+1:idx+read_size]))
+
+      #distal_acc.sort(lambda x,y: signum(x[1]-y[1])) 
+      #distal_acc=distal_acc[-2:]
 
-      acc.sort(lambda x,y: x[0]-y[0]) 
-      acc=acc[-2:]
-      
-      don   = []
-      for idx,score in enumerate(currentDon):
-         if score > -inf:
-            don.append((idx,score))
-         if idx>self.read_size:
-             break 
 
-      don.sort(lambda x,y: x[0]-y[0]) 
-      don=don[-2:]
+      proximal_don   = []
+      for idx in xrange(max_intron_size+read_size/2, max_intron_size+read_size):
+         if currentDon[idx] >= splice_thresh:
+            proximal_don.append((idx, currentDon[idx]))
 
-      return don,acc
+      proximal_don.sort(lambda x,y: signum(x[1]-y[1]))
+      proximal_don=proximal_don[-2:]
 
+      distal_don   = []
+      for idx in xrange(1, max_intron_size):
+         if currentDon[idx] >= splice_thresh and idx>read_size:
+            distal_don.append((idx, currentDon[idx], DNA[idx-read_size:idx]))
+
+      distal_don.sort(lambda x,y: y[0]-x[0])
+      #distal_don=distal_don[-2:]
+
+      return proximal_acc,proximal_don,distal_acc,distal_don
 
    def calcAlternativeAlignments(self,location):
       """
@@ -300,24 +399,28 @@ class PipelineHeuristic:
       strand   = location['strand']
       original_est = location['seq']
       quality      = location['prb']
-      cal_prb  = location['cal_prb']
+      quality        = map(lambda x:ord(x)-64,quality)
+      #cal_prb  = location['cal_prb']
       
-      est = unbracket_est(original_est)
+      original_est = original_est.lower()
+      est = unbracket_seq(original_est)
       effective_len = len(est)
 
-      genomicSeq_start  = pos
-      genomicSeq_stop   = pos+self.intron_size*2+self.read_size*2
+      genomicSeq_start  = pos - self.max_intron_size
+      genomicSeq_stop   = pos + self.max_intron_size + len(est)
+
+      strand_map = {'D':'+', 'P':'-'}
+      strand = strand_map[strand]
 
       start = cpu()
-      currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+      #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
+      currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
       stop = cpu()
       self.get_time += stop-start
       dna   = currentDNASeq
-
-      alt_don,alt_acc = self.findHighestScoringSpliceSites(currentAcc,currentDon)
-      #print alt_don
-      #print alt_acc
-      
+   
+      proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)
+       
       alternativeScores = []
       
       # inlined
@@ -328,113 +431,166 @@ class PipelineHeuristic:
       qualityPlifs = self.qualityPlifs
       # inlined
 
-      # compute dummy scores
-      IntronScore = calculatePlif(h, [self.intron_size+1])[0]
-      #print IntronScore
-      dummyAcceptorScore = calculatePlif(a, [0.25])[0] 
-      dummyDonorScore = calculatePlif(d, [0.25])[0]
-      
+      # find an intron on the 3' end
       _start = cpu()
-      for (don_pos,don_score) in alt_don:
-         # remove mismatching positions in the second exon
-         original_est_cut=''
-
-         est_ptr=0
-         dna_ptr=0
-         ptr=0 
-         while ptr<len(original_est):
-             
-            if original_est[ptr]=='[':
-                dnaletter=original_est[ptr+1]
-                estletter=original_est[ptr+2]
-                if dna_ptr < don_pos:
-                    original_est_cut+=original_est[ptr:ptr+4] 
-                else:
-                    original_est_cut+=estletter # EST letter
-                ptr+=4 
-            else:
-                dnaletter=original_est[ptr]
-                estletter=dnaletter
+      for (don_pos,don_score) in proximal_don:
+          DonorScore = calculatePlif(d, [don_score])[0]
+
+          for (acc_pos,acc_score,acc_dna) in distal_acc:
+
+              IntronScore = calculatePlif(h, [acc_pos-don_pos])[0] 
+              AcceptorScore = calculatePlif(a, [acc_score])[0] 
+          
+              #print 'don splice: ', (don_pos,don_score), (acc_pos,acc_score,acc_dna), (DonorScore,IntronScore,AcceptorScore)
+
+              # construct a new "original_est"
+              original_est_cut=''
+              
+              est_ptr=0
+              dna_ptr=self.max_intron_size
+              ptr=0
+              acc_dna_ptr=0
+              num_mismatch = 0
+              
+              while ptr<len(original_est):
+                  #print acc_dna_ptr,len(acc_dna),acc_pos,don_pos
+                  
+                  if original_est[ptr]=='[':
+                      dnaletter=original_est[ptr+1]
+                      estletter=original_est[ptr+2]
+                      if dna_ptr < don_pos:
+                          original_est_cut+=original_est[ptr:ptr+4]
+                          num_mismatch += 1
+                      else:
+                          if acc_dna[acc_dna_ptr]==estletter:
+                              original_est_cut += estletter # EST letter
+                          else:
+                              original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
+                              num_mismatch += 1
+                              #print '['+acc_dna[acc_dna_ptr]+estletter+']' 
+                          acc_dna_ptr+=1 
+                      ptr+=4 
+                  else:
+                      dnaletter=original_est[ptr]
+                      estletter=dnaletter
                 
-                original_est_cut+=estletter # EST letter
-                ptr+=1
-
-            if estletter=='-':
-                dna_ptr+=1 
-            elif dnaletter=='-':
-                est_ptr+=1
-            else:
-                dna_ptr+=1 
-                est_ptr+=1
+                      if dna_ptr < don_pos:
+                          original_est_cut+=estletter # EST letter
+                      else:
+                          if acc_dna[acc_dna_ptr]==estletter:
+                              original_est_cut += estletter # EST letter
+                          else:
+                              num_mismatch += 1
+                              original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
+                              #print '('+acc_dna[acc_dna_ptr]+estletter+')' 
+                          acc_dna_ptr+=1 
+                          
+                      ptr+=1
+
+                  if estletter=='-':
+                      dna_ptr+=1 
+                  elif dnaletter=='-':
+                      est_ptr+=1
+                  else:
+                      dna_ptr+=1 
+                      est_ptr+=1
+              if num_mismatch>self.max_mismatch:
+                  continue
                          
-         assert(dna_ptr<=len(dna))
-         assert(est_ptr<=len(est))
+              assert(dna_ptr<=len(dna))
+              assert(est_ptr<=len(est))
 
-         #print "Donor"
-         DonorScore = calculatePlif(d, [don_score])[0]
-         #print DonorScore,don_score,don_pos
-         
-         score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
-         score += dummyAcceptorScore + IntronScore + DonorScore
+              #print original_est, original_est_cut              
+
+              score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
+              score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias 
          
-         #print 'diff %f,%f,%f' % ((trueWeight.T * self.currentPhi)[0,0] - score,(trueWeight.T * self.currentPhi)[0,0], score)
-         alternativeScores.append(score)
+              alternativeScores.append(score)
 
+              if acc_score>=self.splice_stop_thresh:
+                  break
+              
       _stop = cpu()
       self.alternativeScoresTime += _stop-_start
 
+      # find an intron on the 5' end 
       _start = cpu()
-      for (acc_pos,acc_score) in alt_acc:
-         # remove mismatching positions in the second exon
-         original_est_cut=''
-
-         est_ptr=0
-         dna_ptr=0
-         ptr=0 
-         while ptr<len(original_est):
-             
-            if original_est[ptr]=='[':
-                dnaletter=original_est[ptr+1]
-                estletter=original_est[ptr+2]
-                if est_ptr>=acc_pos:
-                    original_est_cut+=original_est[ptr:ptr+4] 
-                else:
-                    original_est_cut+=estletter # EST letter
-                ptr+=4 
-            else:
-                dnaletter=original_est[ptr]
-                estletter=dnaletter
+      for (acc_pos,acc_score) in proximal_acc:
+
+          AcceptorScore = calculatePlif(a, [acc_score])[0]
+          
+          for (don_pos,don_score,don_dna) in distal_don:
+
+              DonorScore = calculatePlif(d, [don_score])[0]
+              IntronScore = calculatePlif(h, [acc_pos-don_pos])[0] 
+          
+              #print 'acc splice: ', (don_pos,don_score,don_dna), (acc_pos,acc_score), (DonorScore,IntronScore,AcceptorScore)
+
+              # construct a new "original_est"
+              original_est_cut=''
+              
+              est_ptr=0
+              dna_ptr=self.max_intron_size
+              ptr=0
+              num_mismatch = 0
+              don_dna_ptr=len(don_dna)-(acc_pos-self.max_intron_size)-1
+              while ptr<len(original_est):
+                  
+                  if original_est[ptr]=='[':
+                      dnaletter=original_est[ptr+1]
+                      estletter=original_est[ptr+2]
+                      if dna_ptr > acc_pos:
+                          original_est_cut+=original_est[ptr:ptr+4] 
+                          num_mismatch += 1
+                      else:
+                          if don_dna[don_dna_ptr]==estletter:
+                              original_est_cut += estletter # EST letter
+                          else:
+                              original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
+                              num_mismatch += 1
+                              #print '['+don_dna[don_dna_ptr]+estletter+']' 
+                          don_dna_ptr+=1 
+                      ptr+=4 
+                  else:
+                      dnaletter=original_est[ptr]
+                      estletter=dnaletter
                 
-                original_est_cut+=estletter # EST letter
-                ptr+=1
-
-            if estletter=='-':
-                dna_ptr+=1 
-            elif dnaletter=='-':
-                est_ptr+=1
-            else:
-                dna_ptr+=1 
-                est_ptr+=1
+                      if dna_ptr > acc_pos:
+                          original_est_cut+=estletter # EST letter
+                      else:
+                          if don_dna[don_dna_ptr]==estletter:
+                              original_est_cut += estletter # EST letter
+                          else:
+                              original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
+                              num_mismatch += 1
+                              #print '('+don_dna[don_dna_ptr]+estletter+')' 
+                          don_dna_ptr+=1 
+                          
+                      ptr+=1
+
+                  if estletter=='-':
+                      dna_ptr+=1 
+                  elif dnaletter=='-':
+                      est_ptr+=1
+                  else:
+                      dna_ptr+=1 
+                      est_ptr+=1
                          
-         assert(dna_ptr<=len(dna))
-         assert(est_ptr<=len(est))
+              if num_mismatch>self.max_mismatch:
+                  continue
+              assert(dna_ptr<=len(dna))
+              assert(est_ptr<=len(est))
 
-         #print original_est,original_est_cut
-         
-         AcceptorScore = calculatePlif(d, [acc_score])[0]
-         #print "Acceptor"
-         #print AcceptorScore,acc_score,acc_pos
+              #print original_est, original_est_cut              
 
-         #if acc_score<0.1:
-         #    print currentAcc[0:50]
-         #    print currentDon[0:50]
-         
-         score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
-         score += AcceptorScore + IntronScore + dummyDonorScore
+              score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
+              score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias
          
-         #print 'diff %f,%f,%f' % ((trueWeight.T * self.currentPhi)[0,0] - score,(trueWeight.T * self.currentPhi)[0,0], score)
-         alternativeScores.append(score)
+              alternativeScores.append(score)
 
+              if don_score>=self.splice_stop_thresh:
+                  break
+              
       _stop = cpu()
       self.alternativeScoresTime += _stop-_start
 
@@ -468,34 +624,31 @@ def cpu():
 
 
 if __name__ == '__main__':
-   #run_fname = sys.argv[1]
-   #data_fname = sys.argv[2]
-   #param_filename = sys.argv[3]
-
-   dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_test/run_+_quality_+_splicesignals_+_intron_len'
-   jp = os.path.join
+   if len(sys.argv) != 6:
+      print 'Usage: ./%s data param run spliced.results unspliced.results' % (sys.argv[0])
+      sys.exit(1)
 
-   run_fname   = jp(dir,'run_object.pickle')
-   #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_flag'
+   data_fname = sys.argv[1]
+   param_fname = sys.argv[2]
+   run_fname = sys.argv[3]
 
-   #data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_2k'
-   data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_100'
+   result_spliced_fname    = sys.argv[4]
+   result_unspliced_fname  = sys.argv[5]
 
-   param_fname = jp(dir,'param_500.pickle')
+   jp = os.path.join
 
-   ph1 = PipelineHeuristic(run_fname,data_fname,param_fname)
+   ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_spliced_fname,result_unspliced_fname)
 
    start = cpu()
    ph1.filter()
    stop = cpu()
-
    print 'total time elapsed: %f' % (stop-start)
+
    print 'time spend for get seq: %f' % ph1.get_time
    print 'time spend for calcAlignmentScoreTime: %f' %  ph1.calcAlignmentScoreTime
    print 'time spend for alternativeScoresTime: %f' % ph1.alternativeScoresTime
    print 'time spend for count time: %f' % ph1.count_time
    print 'time spend for init time: %f' % ph1.init_time
-   print 'time spend for read_parsing time: %f' % ph1.read_parsing
    print 'time spend for main_loop time: %f' % ph1.main_loop
    print 'time spend for splice_site_time time: %f' % ph1.splice_site_time
 
@@ -503,9 +656,3 @@ if __name__ == '__main__':
    print 'time spend for computeSpliceWeightsTime time: %f'% ph1.computeSpliceWeightsTime
    print 'time spend for DotProdTime time: %f'% ph1.DotProdTime
    print 'time spend forarray_stuff time: %f'% ph1.array_stuff
-   #import cProfile
-   #cProfile.run('ph1.filter()')
-
-   #import hotshot
-   #p = hotshot.Profile('profile.log')
-   #p.runcall(ph1.filter)