+ minor changes in the paths
[qpalma.git] / scripts / PipelineHeuristic.py
index 6979d23..f4050bc 100644 (file)
@@ -3,49 +3,53 @@
 
 import cPickle
 import sys
-import pydb
 import pdb
 import os
 import os.path
-import math
+import resource
 
-from qpalma.DataProc import *
 from qpalma.computeSpliceWeights import *
 from qpalma.set_param_palma import *
 from qpalma.computeSpliceAlignWithQuality import *
-from qpalma.penalty_lookup_new import *
-from qpalma.compute_donacc import *
-from qpalma.TrainingParam import Param
-from qpalma.Plif import Plf
 
-from qpalma.tools.splicesites import getDonAccScores
-from qpalma.Configuration import *
+from numpy.matlib import mat,zeros,ones,inf
+from numpy import inf,mean
 
-from compile_dataset import getSpliceScores, get_seq_and_scores
+from ParaParser import ParaParser,IN_VECTOR
 
+from qpalma.Lookup import LookupTable
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,reverse_complement,unbracket_seq
 
-from numpy.matlib import mat,zeros,ones,inf
-from numpy import inf
 
-from qpalma.parsers import PipelineReadParser
+def cpu():
+   return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
+   resource.getrusage(resource.RUSAGE_SELF).ru_stime) 
 
 
-def unbracket_est(est):
-   new_est = ''
-   e = 0
+class BracketWrapper:
+   fields = ['id', 'chr', 'pos', 'strand', 'seq', 'prb']
 
-   while True:
-      if e >= len(est):
-         break
+   def __init__(self,filename):
+      self.parser = ParaParser("%lu%d%d%s%s%s",self.fields,len(self.fields),IN_VECTOR)
+      self.parser.parseFile(filename)
+
+   def __len__(self):
+      return self.parser.getSize(IN_VECTOR)
+      
+   def __getitem__(self,key):
+      return self.parser.fetchEntry(key)
 
-      if est[e] == '[':
-         new_est += est[e+2]
-         e += 4
-      else:
-         new_est += est[e]
-         e += 1
+   def __iter__(self):
+      self.counter = 0
+      self.size = self.parser.getSize(IN_VECTOR)
+      return self
 
-   return "".join(new_est).lower()
+   def next(self):
+      if not self.counter < self.size:
+         raise StopIteration
+      return_val = self.parser.fetchEntry(self.counter)
+      self.counter += 1
+      return return_val
 
 
 class PipelineHeuristic:
@@ -54,7 +58,7 @@ class PipelineHeuristic:
    vmatch is spliced an should be then newly aligned using QPalma or not.
    """
 
-   def __init__(self,run_fname,data_fname,param_fname):
+   def __init__(self,run_fname,data_fname,param_fname,result_fname):
       """
       We need a run object holding information about the nr. of support points
       etc.
@@ -63,6 +67,46 @@ class PipelineHeuristic:
       run = cPickle.load(open(run_fname))
       self.run = run
 
+      start = cpu()
+      self.all_remapped_reads = BracketWrapper(data_fname)
+      stop = cpu()
+
+      print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
+
+      #g_dir    = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
+      #acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
+      #don_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
+
+      #g_fmt = 'contig%d.dna.flat'
+      #s_fmt = 'contig_%d%s'
+
+      #self.chromo_range = range(0,1099)
+
+      g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+      g_fmt = 'chr%d.dna.flat'
+      s_fmt = 'contig_%d%s'
+
+      self.chromo_range = range(1,6)
+      #self.chromo_range = range(1,2)
+
+      num_chromo = 2
+      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+      self.seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+
+      start = cpu()
+      self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,self.chromo_range)
+      stop = cpu()
+
+      print 'prefetched sequence and splice data in %f sec' % (stop-start)
+
+      self.result_spliced_fh = open('%s.spliced'%result_fname,'w+')
+      self.result_unspliced_fh = open('%s.unspliced'%result_fname,'w+')
+
+      start = cpu()
+
       self.data_fname = data_fname
 
       self.param = cPickle.load(open(param_fname))
@@ -76,12 +120,88 @@ class PipelineHeuristic:
       self.mmatrix = mmatrix
       self.qualityPlifs = qualityPlifs
 
-      # when we look for alternative alignments with introns this value is the
-      # mean of intron size
-      self.intron_size  = 90
-
-      self.read_size    = 36
+      self.read_size    = 38
+      self.prb_offset   = 64
+      #self.prb_offset   = 50
+
+      # parameters of the heuristics to decide whether the read is spliced
+      #self.splice_thresh      = 0.005
+      self.splice_thresh      = 0.01
+      self.max_intron_size    = 2000
+      self.max_mismatch       = 2 
+      #self.splice_stop_thresh = 0.99
+      self.splice_stop_thresh = 0.8
+      self.spliced_bias       = 0.0
+
+      param_lines = [\
+      ('%f','splice_thresh',self.splice_thresh),\
+      ('%d','max_intron_size',self.max_intron_size),\
+      ('%d','max_mismatch',self.max_mismatch),\
+      ('%f','splice_stop_thresh',self.splice_stop_thresh),\
+      ('%f','spliced_bias',self.spliced_bias)]
+
+      param_lines = [('# %s: '+form+'\n')%(name,val) for form,name,val in param_lines]
+      param_lines.append('# data: %s\n'%self.data_fname)
+      param_lines.append('# param: %s\n'%param_fname)
+
+      #pdb.set_trace()
+
+      for p_line in param_lines:
+         self.result_spliced_fh.write(p_line)
+         self.result_unspliced_fh.write(p_line)
    
+      #self.original_reads = {}
+
+      # we do not have this information
+      #for line in open(reads_pipeline_fn):
+      #   line = line.strip()
+      #   id,seq,q1,q2,q3 = line.split()
+      #   id = int(id)
+      #   self.original_reads[id] = seq
+
+      lengthSP    = run['numLengthSuppPoints']
+      donSP       = run['numDonSuppPoints']
+      accSP       = run['numAccSuppPoints']
+      mmatrixSP   = run['matchmatrixRows']*run['matchmatrixCols']
+      numq        = run['numQualSuppPoints']
+      totalQualSP = run['totalQualSuppPoints']
+
+      currentPhi = zeros((run['numFeatures'],1))
+      currentPhi[0:lengthSP]                                            = mat(h.penalties[:]).reshape(lengthSP,1)
+      currentPhi[lengthSP:lengthSP+donSP]                               = mat(d.penalties[:]).reshape(donSP,1)
+      currentPhi[lengthSP+donSP:lengthSP+donSP+accSP]                   = mat(a.penalties[:]).reshape(accSP,1)
+      currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP]   = mmatrix[:]
+
+      totalQualityPenalties = self.param[-totalQualSP:]
+      currentPhi[lengthSP+donSP+accSP+mmatrixSP:]                    = totalQualityPenalties[:]
+      self.currentPhi = currentPhi
+
+      # we want to identify spliced reads 
+      # so true pos are spliced reads that are predicted "spliced"
+      self.true_pos  = 0
+      
+      # as false positives we count all reads that are not spliced but predicted
+      # as "spliced"
+      self.false_pos = 0
+
+      self.true_neg  = 0
+      self.false_neg = 0
+
+      # total time spend for get seq and scores
+      self.get_time  = 0.0
+      self.calcAlignmentScoreTime = 0.0
+      self.alternativeScoresTime = 0.0
+
+      self.count_time = 0.0
+      self.main_loop = 0.0
+      self.splice_site_time = 0.0
+      self.computeSpliceAlignWithQualityTime = 0.0
+      self.computeSpliceWeightsTime = 0.0
+      self.DotProdTime = 0.0
+      self.array_stuff = 0.0
+      stop = cpu()
+
+      self.init_time = stop-start
 
    def filter(self):
       """
@@ -89,103 +209,186 @@ class PipelineHeuristic:
       """
       run = self.run 
 
-      rrp = PipelineReadParser(self.data_fname)
-      all_remapped_reads = rrp.parse()
-
       ctr = 0
       unspliced_ctr  = 0
       spliced_ctr    = 0
 
       print 'Starting filtering...'
+      _start = cpu()
 
-      for readId,currentReadLocations in all_remapped_reads.items():
-         for location in currentReadLocations:
-            chr      = location['chr']
-            pos      = location['pos']
-            strand   = location['strand']
-            mismatch = location['mismatches']
-            length   = location['length']
-            off      = location['offset']
-            seq      = location['seq']
-            prb      = location['prb']
-            cal_prb  = location['cal_prb']
-            chastity = location['chastity']
-            is_spliced = location['is_spliced']
-
-            if strand == '-':
-               continue
-
-            if ctr == 100:
-               break
-
-            #if pos > 10000000:
-            #   continue
-      
-            unb_seq = unbracket_est(seq)
-            effective_len = len(unb_seq)
+      #for readId,currentReadLocations in all_remapped_reads.items():
+         #for location in currentReadLocations[:1]:
+
+      for location,original_line in self.all_remapped_reads:
+         
+         if ctr % 1000 == 0:
+            print ctr
+
+         id       = location['id']
+         chr      = location['chr']
+         pos      = location['pos']
+         strand   = location['strand']
+         seq      = location['seq']
+         prb      = location['prb']
 
-            genomicSeq_start  = pos
-            genomicSeq_stop   = pos+effective_len-1
+         id = int(id)
 
-            #print genomicSeq_start,genomicSeq_stop
-            currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+         seq = seq.lower()
 
-            dna            = currentDNASeq
-            exons          = zeros((2,1))
-            exons[0,0]     = 0
-            exons[1,0]     = effective_len
-            est            = unb_seq
-            original_est   = seq
-            quality        = prb
+         strand_map = {'D':'+', 'P':'-'}
 
-            #pdb.set_trace()
+         strand = strand_map[strand]
+
+         if not chr in self.chromo_range:
+            continue
+
+         unb_seq = unbracket_seq(seq)
+
+         # forgot to do this
+         if strand == '-':
+            #pos = self.lt1.seqInfo.chromo_sizes[chr]-pos-self.read_size
+            unb_seq = reverse_complement(unb_seq)
+            seq = reverse_complement(seq)
+
+         effective_len = len(unb_seq)
+
+         genomicSeq_start  = pos
+         #genomicSeq_stop   = pos+effective_len-1
+         genomicSeq_stop   = pos+effective_len
+
+         start = cpu()
+         #currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
+         currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop)
+
+         # just performed some tests to check LookupTable results
+         #assert currentDNASeq_ == currentDNASeq
+         #assert currentAcc_ == currentAcc_
+         #assert currentDon_ == currentDon_
+
+         stop = cpu()
+         self.get_time += stop-start
+
+         dna            = currentDNASeq
+         exons          = zeros((2,1))
+         exons[0,0]     = 0
+         exons[1,0]     = effective_len
+         est            = unb_seq
+         original_est   = seq
+         quality        = map(lambda x:ord(x)-self.prb_offset,prb)
+
+         currentVMatchAlignment = dna, exons, est, original_est, quality,\
+         currentAcc, currentDon
+
+         #pdb.set_trace()
 
-            currentVMatchAlignment = dna, exons, est, original_est, quality,\
-            currentAcc, currentDon
-            vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
+         #alternativeAlignmentScores = self.calcAlternativeAlignments(location)
 
+         try:
             alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+         except:
+            alternativeAlignmentScores = []
+
+         if alternativeAlignmentScores == []:
+             # no alignment necessary
+             maxAlternativeAlignmentScore = -inf
+             vMatchScore = 0.0 
+         else:
+             maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
+             # compute alignment for vmatch unspliced read
+             vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
+             #print 'vMatchScore/alternativeScore: %f %f %s' % (vMatchScore,maxAlternativeAlignmentScore,strand)
+             
+
+         start = cpu()
+         
+         #print 'all candidates %s' % str(alternativeAlignmentScores)
+
+         new_id = id - 1000000300000
+
+         unspliced = False
+         # unspliced 
+         if new_id > 0: 
+            unspliced = True
+
+         # Seems that according to our learned parameters VMatch found a good
+         # alignment of the current read
+         if maxAlternativeAlignmentScore < vMatchScore:
+            unspliced_ctr += 1
+
+            self.result_unspliced_fh.write(original_line+'\n') 
+
+            if unspliced:
+               self.true_neg += 1
+            else:
+               self.false_neg += 1
+
+         # We found an alternative alignment considering splice sites that scores
+         # higher than the VMatch alignment
+         else:
+            spliced_ctr += 1
 
-            # found no alternatives
-            if alternativeAlignmentScores == []:
-               continue
-            
-            maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
-            #print 'vMatchScore/alternativeScore: %f %f ' % (vMatchScore,maxAlternativeAlignmentScore)
-            #print 'all candidates %s' % str(alternativeAlignmentScores)
-
-            # Seems that according to our learned parameters VMatch found a good
-            # alignment of the current read
-            if maxAlternativeAlignmentScore < vMatchScore:
-               unspliced_ctr += 1
-            # We found an alternative alignment considering splice sites that scores
-            # higher than the VMatch alignment
+            self.result_spliced_fh.write(original_line+'\n')
+
+            if unspliced:
+               self.false_pos += 1
             else:
-               spliced_ctr += 1
+               self.true_pos += 1
+
+         ctr += 1
+         stop = cpu()
+         self.count_time = stop-start
 
-            ctr += 1
+      _stop = cpu()
+      self.main_loop = _stop-_start
 
       print 'Unspliced/Splice: %d %d'%(unspliced_ctr,spliced_ctr)
+      print 'True pos / false pos : %d %d'%(self.true_pos,self.false_pos)
+      print 'True neg / false neg : %d %d'%(self.true_neg,self.false_neg)
+
+
+   def findHighestScoringSpliceSites(self, currentAcc, currentDon, DNA, max_intron_size, read_size, splice_thresh):
+
+      def signum(a):
+          if a>0: 
+              return 1
+         elif a<0:
+              return -1
+          else:
+              return 0
+
+      proximal_acc   = []
+      for idx in xrange(max_intron_size, max_intron_size+read_size/2):
+          if currentAcc[idx]>= splice_thresh:
+            proximal_acc.append((idx,currentAcc[idx]))
 
+      proximal_acc.sort(lambda x,y: signum(x[1]-y[1])) 
+      proximal_acc=proximal_acc[-2:]
 
-   def findHighestScoringSpliceSites(self,currentAcc,currentDon):
-      max_don  = -inf
-      don_pos  = []
-      for idx,score in enumerate(currentDon):
-         if score > -inf and idx > 1 and idx < self.read_size:
-            don_pos.append(idx)
+      distal_acc   = []
+      for idx in xrange(max_intron_size+read_size, len(currentAcc)):
+          if currentAcc[idx]>= splice_thresh and idx+read_size<len(currentAcc):
+            distal_acc.append((idx, currentAcc[idx], DNA[idx+1:idx+read_size]))
 
-         if len(don_pos) == 2:
-            break
+      #distal_acc.sort(lambda x,y: signum(x[1]-y[1])) 
+      #distal_acc=distal_acc[-2:]
 
-      max_acc  = -inf
-      acc_pos  = []
-      for idx,score in enumerate(currentAcc):
-         if score > -inf and idx > self.intron_size:
-            acc_pos = idx
-            break
+      proximal_don   = []
+      for idx in xrange(max_intron_size+read_size/2, max_intron_size+read_size):
+         if currentDon[idx] >= splice_thresh:
+            proximal_don.append((idx, currentDon[idx]))
 
-      return don_pos,acc_pos
+      proximal_don.sort(lambda x,y: signum(x[1]-y[1]))
+      proximal_don=proximal_don[-2:]
+
+      distal_don   = []
+      for idx in xrange(1, max_intron_size):
+         if currentDon[idx] >= splice_thresh and idx>read_size:
+            distal_don.append((idx, currentDon[idx], DNA[idx-read_size:idx]))
+
+      distal_don.sort(lambda x,y: y[0]-x[0])
+      #distal_don=distal_don[-2:]
+
+      return proximal_acc,proximal_don,distal_acc,distal_don
 
 
    def calcAlternativeAlignments(self,location):
@@ -194,55 +397,216 @@ class PipelineHeuristic:
       alternative alignments taking into account for example matched
       donor/acceptor positions.
       """
+
       run = self.run
 
+      id       = location['id']
       chr      = location['chr']
       pos      = location['pos']
       strand   = location['strand']
-      seq      = location['seq']
-      orig_seq = location['orig_seq']
-      prb      = location['prb']
-      cal_prb  = location['cal_prb']
-      is_spliced = location['is_spliced']
+      original_est = location['seq']
+      quality      = location['prb']
+      quality        = map(lambda x:ord(x)-self.prb_offset,quality)
       
-      unb_seq = unbracket_est(seq)
-      effective_len = len(unb_seq)
+      original_est = original_est.lower()
+      est = unbracket_seq(original_est)
+      effective_len = len(est)
 
-      genomicSeq_start  = pos
-      genomicSeq_stop   = pos+self.intron_size*2+self.read_size*2
+      genomicSeq_start  = pos - self.max_intron_size
+      genomicSeq_stop   = pos + self.max_intron_size + len(est)
 
-      currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
-      dna            = currentDNASeq
+      strand_map = {'D':'+', 'P':'-'}
+      strand = strand_map[strand]
 
-      alt_don_pos,acc_pos = self.findHighestScoringSpliceSites(currentAcc,currentDon)
+      if strand == '-':
+         est = reverse_complement(est)
+         original_est = reverse_complement(original_est)
 
-      #print alt_don_pos,acc_pos
+      start = cpu()
+      #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
+      currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop)
+      stop = cpu()
+      self.get_time += stop-start
+      dna   = currentDNASeq
 
+      proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)
+       
       alternativeScores = []
 
-      for don_pos in alt_don_pos:
-         exons          = zeros((2,2),dtype=numpy.int)
-         exons[0,0]     = 0
-         exons[0,1]     = don_pos
-         exons[1,0]     = acc_pos+1
-         exons[1,1]     = acc_pos+1+(self.read_size-don_pos)
-         est            = unb_seq
-         original_est   = seq
-         quality        = prb
-
-         _dna = dna[:int(exons[1,1])]
-         _dna = _dna[:exons[1,0]] + orig_seq[don_pos:]
-
-         #pdb.set_trace()
-
-         _currentAcc = currentAcc[:int(exons[1,1])]
-         _currentDon = currentDon[:int(exons[1,1])]
-
-         currentVMatchAlignment = _dna, exons, est, original_est, quality,\
-         _currentAcc, _currentDon
-
-         #alternativeScore = self.calcAlignmentScore(currentVMatchAlignment)
-         alternativeScores.append(self.calcAlignmentScore(currentVMatchAlignment))
+      #pdb.set_trace()
+      
+      # inlined
+      h = self.h
+      d = self.d
+      a = self.a
+      mmatrix = self.mmatrix
+      qualityPlifs = self.qualityPlifs
+      # inlined
+
+      # find an intron on the 3' end
+      _start = cpu()
+      for (don_pos,don_score) in proximal_don:
+          DonorScore = calculatePlif(d, [don_score])[0]
+
+          for (acc_pos,acc_score,acc_dna) in distal_acc:
+
+              IntronScore = calculatePlif(h, [acc_pos-don_pos])[0] 
+              AcceptorScore = calculatePlif(a, [acc_score])[0] 
+          
+              #print 'don splice: ', (don_pos,don_score), (acc_pos,acc_score,acc_dna), (DonorScore,IntronScore,AcceptorScore)
+
+              # construct a new "original_est"
+              original_est_cut=''
+              
+              est_ptr=0
+              dna_ptr=self.max_intron_size
+              ptr=0
+              acc_dna_ptr=0
+              num_mismatch = 0
+              
+              while ptr<len(original_est):
+                  #print acc_dna_ptr,len(acc_dna),acc_pos,don_pos
+                  
+                  if original_est[ptr]=='[':
+                      dnaletter=original_est[ptr+1]
+                      estletter=original_est[ptr+2]
+                      if dna_ptr < don_pos:
+                          original_est_cut+=original_est[ptr:ptr+4]
+                          num_mismatch += 1
+                      else:
+                          if acc_dna[acc_dna_ptr]==estletter:
+                              original_est_cut += estletter # EST letter
+                          else:
+                              original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
+                              num_mismatch += 1
+                              #print '['+acc_dna[acc_dna_ptr]+estletter+']' 
+                          acc_dna_ptr+=1 
+                      ptr+=4 
+                  else:
+                      dnaletter=original_est[ptr]
+                      estletter=dnaletter
+                
+                      if dna_ptr < don_pos:
+                          original_est_cut+=estletter # EST letter
+                      else:
+                          if acc_dna[acc_dna_ptr]==estletter:
+                              original_est_cut += estletter # EST letter
+                          else:
+                              num_mismatch += 1
+                              original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
+                              #print '('+acc_dna[acc_dna_ptr]+estletter+')' 
+                          acc_dna_ptr+=1 
+                          
+                      ptr+=1
+
+                  if estletter=='-':
+                      dna_ptr+=1 
+                  elif dnaletter=='-':
+                      est_ptr+=1
+                  else:
+                      dna_ptr+=1 
+                      est_ptr+=1
+              if num_mismatch>self.max_mismatch:
+                  continue
+                         
+              assert(dna_ptr<=len(dna))
+              assert(est_ptr<=len(est))
+
+              #print original_est, original_est_cut              
+
+              score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
+              score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias 
+         
+              alternativeScores.append(score)
+
+              if acc_score>=self.splice_stop_thresh:
+                  break
+              
+      #pdb.set_trace()
+
+      _stop = cpu()
+      self.alternativeScoresTime += _stop-_start
+
+      # find an intron on the 5' end 
+      _start = cpu()
+      for (acc_pos,acc_score) in proximal_acc:
+
+          AcceptorScore = calculatePlif(a, [acc_score])[0]
+          
+          for (don_pos,don_score,don_dna) in distal_don:
+
+              DonorScore = calculatePlif(d, [don_score])[0]
+              IntronScore = calculatePlif(h, [acc_pos-don_pos])[0] 
+          
+              #print 'acc splice: ', (don_pos,don_score,don_dna), (acc_pos,acc_score), (DonorScore,IntronScore,AcceptorScore)
+
+              # construct a new "original_est"
+              original_est_cut=''
+              
+              est_ptr=0
+              dna_ptr=self.max_intron_size
+              ptr=0
+              num_mismatch = 0
+              don_dna_ptr=len(don_dna)-(acc_pos-self.max_intron_size)-1
+              while ptr<len(original_est):
+                  
+                  if original_est[ptr]=='[':
+                      dnaletter=original_est[ptr+1]
+                      estletter=original_est[ptr+2]
+                      if dna_ptr > acc_pos:
+                          original_est_cut+=original_est[ptr:ptr+4] 
+                          num_mismatch += 1
+                      else:
+                          if don_dna[don_dna_ptr]==estletter:
+                              original_est_cut += estletter # EST letter
+                          else:
+                              original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
+                              num_mismatch += 1
+                              #print '['+don_dna[don_dna_ptr]+estletter+']' 
+                          don_dna_ptr+=1 
+                      ptr+=4 
+                  else:
+                      dnaletter=original_est[ptr]
+                      estletter=dnaletter
+                
+                      if dna_ptr > acc_pos:
+                          original_est_cut+=estletter # EST letter
+                      else:
+                          if don_dna[don_dna_ptr]==estletter:
+                              original_est_cut += estletter # EST letter
+                          else:
+                              original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
+                              num_mismatch += 1
+                              #print '('+don_dna[don_dna_ptr]+estletter+')' 
+                          don_dna_ptr+=1 
+                          
+                      ptr+=1
+
+                  if estletter=='-':
+                      dna_ptr+=1 
+                  elif dnaletter=='-':
+                      est_ptr+=1
+                  else:
+                      dna_ptr+=1 
+                      est_ptr+=1
+                         
+              if num_mismatch>self.max_mismatch:
+                  continue
+              assert(dna_ptr<=len(dna))
+              assert(est_ptr<=len(est))
+
+              #print original_est, original_est_cut              
+
+              score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
+              score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias
+         
+              alternativeScores.append(score)
+
+              if don_score>=self.splice_stop_thresh:
+                  break
+              
+      _stop = cpu()
+      self.alternativeScoresTime += _stop-_start
 
       return alternativeScores
 
@@ -253,62 +617,16 @@ class PipelineHeuristic:
       this function calculates the dot product of the feature representation of
       the alignment and the parameter vector i.e the alignment score. 
       """
-      run = self.run
-
-      h = self.h
-      d = self.d
-      a = self.a
-      mmatrix = self.mmatrix
-      qualityPlifs = self.qualityPlifs
 
-      lengthSP    = run['numLengthSuppPoints']
-      donSP       = run['numDonSuppPoints']
-      accSP       = run['numAccSuppPoints']
-      mmatrixSP   = run['matchmatrixRows']*run['matchmatrixCols']
-      numq        = run['numQualSuppPoints']
-      totalQualSP = run['totalQualSuppPoints']
-
-      currentPhi = zeros((run['numFeatures'],1))
+      start = cpu()
+      run = self.run
 
       # Lets start calculation
       dna, exons, est, original_est, quality, acc_supp, don_supp = alignment
 
-      # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
-      trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
-      computeSpliceAlignWithQuality(dna, exons, est, original_est,\
-      quality, qualityPlifs, run)
-
-      # Calculate the weights
-      trueWeightDon, trueWeightAcc, trueWeightIntron =\
-      computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
-
-      trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+      score = computeSpliceAlignScoreWithQuality(original_est, quality, self.qualityPlifs, run, self.currentPhi)
 
-      currentPhi[0:lengthSP]                                            = mat(h.penalties[:]).reshape(lengthSP,1)
-      currentPhi[lengthSP:lengthSP+donSP]                               = mat(d.penalties[:]).reshape(donSP,1)
-      currentPhi[lengthSP+donSP:lengthSP+donSP+accSP]                   = mat(a.penalties[:]).reshape(accSP,1)
-      currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP]   = mmatrix[:]
-
-      totalQualityPenalties = self.param[-totalQualSP:]
-      currentPhi[lengthSP+donSP+accSP+mmatrixSP:]                    = totalQualityPenalties[:]
-
-      # Calculate w'phi(x,y) the total score of the alignment
-      return (trueWeight.T * currentPhi)[0,0]
-
-
-if __name__ == '__main__':
-   #run_fname = sys.argv[1]
-   #data_fname = sys.argv[2]
-   #param_filename = sys.argv[3]
-
-   dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_test/run_+_quality_+_splicesignals_+_intron_len'
-   jp = os.path.join
-
-   run_fname      = jp(dir,'run_object.pickle')
-   #data_fname     = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_flag'
-   data_fname     = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_spliced_flag'
-
-   param_fname = jp(dir,'param_500.pickle')
-
-   ph1 = PipelineHeuristic(run_fname,data_fname,param_fname)
-   ph1.filter()
+      stop = cpu()
+      self.calcAlignmentScoreTime += stop-start
+      
+      return score