minor changes in the paths

diff --git a/scripts/PipelineHeuristic.py b/scripts/PipelineHeuristic.py
index 3686991..f4050bc 100644
--- a/scripts/PipelineHeuristic.py
+++ b/scripts/PipelineHeuristic.py
@@ -6,45 +6,50 @@ import sys
 import pdb
 import os
 import os.path
-import math
 import resource
 
-from qpalma.DataProc import *
 from qpalma.computeSpliceWeights import *
 from qpalma.set_param_palma import *
 from qpalma.computeSpliceAlignWithQuality import *
-from qpalma.penalty_lookup_new import *
-from qpalma.compute_donacc import *
-from qpalma.TrainingParam import Param
-from qpalma.Plif import Plf
-
-#from qpalma.tools.splicesites import getDonAccScores
-from qpalma.Configuration import *
-
-from compile_dataset import getSpliceScores, get_seq_and_scores
 
 from numpy.matlib import mat,zeros,ones,inf
 from numpy import inf,mean
 
 
-from qpalma.parsers import PipelineReadParser
+from ParaParser import ParaParser,IN_VECTOR
+
+from qpalma.Lookup import LookupTable
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,reverse_complement,unbracket_seq
+
+
+def cpu():
+   return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
+   resource.getrusage(resource.RUSAGE_SELF).ru_stime) 
+
 
 
+class BracketWrapper:
+   fields = ['id', 'chr', 'pos', 'strand', 'seq', 'prb']
 
 
-def unbracket_est(est):
-   new_est = ''
-   e = 0
+   def __init__(self,filename):
+      self.parser = ParaParser("%lu%d%d%s%s%s",self.fields,len(self.fields),IN_VECTOR)
+      self.parser.parseFile(filename)
 
 
-   while True:
-      if e >= len(est):
-         break
+   def __len__(self):
+      return self.parser.getSize(IN_VECTOR)
+      
+   def __getitem__(self,key):
+      return self.parser.fetchEntry(key)
 
 
-      if est[e] == '[':
-         new_est += est[e+2]
-         e += 4
-      else:
-         new_est += est[e]
-         e += 1
+   def __iter__(self):
+      self.counter = 0
+      self.size = self.parser.getSize(IN_VECTOR)
+      return self
 
 
-   return "".join(new_est).lower()
+   def next(self):
+      if not self.counter < self.size:
+         raise StopIteration
+      return_val = self.parser.fetchEntry(self.counter)
+      self.counter += 1
+      return return_val
 
 
 class PipelineHeuristic:
@@ -53,7 +58,7 @@ class PipelineHeuristic:
    vmatch is spliced and should then be newly aligned using QPalma or not.
    """
 
-   def __init__(self,run_fname,data_fname,param_fname,result_spliced_fname,result_unspliced_fname):
+   def __init__(self,run_fname,data_fname,param_fname,result_fname):
       """
       We need a run object holding information about the nr. of support points
       etc.
       """
       We need a run object holding information about the nr. of support points
       etc.
@@ -62,8 +67,43 @@ class PipelineHeuristic:
       run = cPickle.load(open(run_fname))
       self.run = run
 
-      self.result_spliced_fh = open(result_spliced_fname,'w+')
-      self.result_unspliced_fh = open(result_unspliced_fname,'w+')
+      start = cpu()
+      self.all_remapped_reads = BracketWrapper(data_fname)
+      stop = cpu()
+
+      print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
+
+      #g_dir    = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
+      #acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
+      #don_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
+
+      #g_fmt = 'contig%d.dna.flat'
+      #s_fmt = 'contig_%d%s'
+
+      #self.chromo_range = range(0,1099)
+
+      g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+      g_fmt = 'chr%d.dna.flat'
+      s_fmt = 'contig_%d%s'
+
+      self.chromo_range = range(1,6)
+      #self.chromo_range = range(1,2)
+
+      num_chromo = 2
+      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+      self.seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+
+      start = cpu()
+      self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,self.chromo_range)
+      stop = cpu()
+
+      print 'prefetched sequence and splice data in %f sec' % (stop-start)
+
+      self.result_spliced_fh = open('%s.spliced'%result_fname,'w+')
+      self.result_unspliced_fh = open('%s.unspliced'%result_fname,'w+')
 
       start = cpu()
 
@@ -80,22 +120,44 @@ class PipelineHeuristic:
       self.mmatrix = mmatrix
       self.qualityPlifs = qualityPlifs
 
-      self.read_size    = 36
+      self.read_size    = 38
+      self.prb_offset   = 64
+      #self.prb_offset   = 50
 
       # parameters of the heuristics to decide whether the read is spliced
-      self.splice_thresh      = 0.005
+      #self.splice_thresh      = 0.005
+      self.splice_thresh      = 0.01
       self.max_intron_size    = 2000
       self.max_mismatch       = 2 
-      self.splice_stop_thresh = 0.99
+      #self.splice_stop_thresh = 0.99
+      self.splice_stop_thresh = 0.8
       self.spliced_bias       = 0.0
+
+      param_lines = [\
+      ('%f','splice_thresh',self.splice_thresh),\
+      ('%d','max_intron_size',self.max_intron_size),\
+      ('%d','max_mismatch',self.max_mismatch),\
+      ('%f','splice_stop_thresh',self.splice_stop_thresh),\
+      ('%f','spliced_bias',self.spliced_bias)]
+
+      param_lines = [('# %s: '+form+'\n')%(name,val) for form,name,val in param_lines]
+      param_lines.append('# data: %s\n'%self.data_fname)
+      param_lines.append('# param: %s\n'%param_fname)
+
+      #pdb.set_trace()
+
+      for p_line in param_lines:
+         self.result_spliced_fh.write(p_line)
+         self.result_unspliced_fh.write(p_line)
    
    
-      self.original_reads = {}
+      #self.original_reads = {}
 
 
-      for line in open('/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.pipeline'):
-         line = line.strip()
-         id,seq,q1,q2,q3 = line.split()
-         id = int(id)
-         self.original_reads[id] = seq
+      # we do not have this information
+      #for line in open(reads_pipeline_fn):
+      #   line = line.strip()
+      #   id,seq,q1,q2,q3 = line.split()
+      #   id = int(id)
+      #   self.original_reads[id] = seq
 
       lengthSP    = run['numLengthSuppPoints']
       donSP       = run['numDonSuppPoints']
@@ -131,7 +193,6 @@ class PipelineHeuristic:
       self.alternativeScoresTime = 0.0
 
       self.count_time = 0.0
-      self.read_parsing = 0.0
       self.main_loop = 0.0
       self.splice_site_time = 0.0
       self.computeSpliceAlignWithQualityTime = 0.0
@@ -148,15 +209,6 @@ class PipelineHeuristic:
       """
       run = self.run 
 
       """
       run = self.run 
 
-      start = cpu()
-
-      rrp = PipelineReadParser(self.data_fname)
-      all_remapped_reads = rrp.parse()
-
-      stop = cpu()
-
-      self.read_parsing = stop-start
-
       ctr = 0
       unspliced_ctr  = 0
       spliced_ctr    = 0
@@ -164,114 +216,127 @@ class PipelineHeuristic:
       print 'Starting filtering...'
       _start = cpu()
 
-      for readId,currentReadLocations in all_remapped_reads.items():
-         for location in currentReadLocations[:1]:
+      #for readId,currentReadLocations in all_remapped_reads.items():
+         #for location in currentReadLocations[:1]:
 
 
-            id       = location['id']
-            chr      = location['chr']
-            pos      = location['pos']
-            strand   = location['strand']
-            mismatch = location['mismatches']
-            length   = location['length']
-            off      = location['offset']
-            seq      = location['seq']
-            prb      = location['prb']
-            cal_prb  = location['cal_prb']
-            chastity = location['chastity']
+      for location,original_line in self.all_remapped_reads:
+         
+         if ctr % 1000 == 0:
+            print ctr
 
 
-            id = int(id)
+         id       = location['id']
+         chr      = location['chr']
+         pos      = location['pos']
+         strand   = location['strand']
+         seq      = location['seq']
+         prb      = location['prb']
 
 
-            strand_map = {'+':'D', '-':'P'}
+         id = int(id)
 
 
+         seq = seq.lower()
 
 
-            original_line = '%d\t%d\t%d\t%s\t%d\t%d\t%d\t%s\t%s\t%s\t%s\n' %\
-            (id,chr,pos,strand_map[strand],int(mismatch),int(length),int(off),seq.upper(),prb,cal_prb,chastity)
+         strand_map = {'D':'+', 'P':'-'}
 
 
+         strand = strand_map[strand]
 
 
-            if strand == '-':
-               continue
+         if not chr in self.chromo_range:
+            continue
 
 
-            if ctr == 1000:
-               break
+         unb_seq = unbracket_seq(seq)
 
 
-            #if pos > 10000000:
-            #   continue
-      
-            unb_seq = unbracket_est(seq)
-            effective_len = len(unb_seq)
+         # forgot to do this
+         if strand == '-':
+            #pos = self.lt1.seqInfo.chromo_sizes[chr]-pos-self.read_size
+            unb_seq = reverse_complement(unb_seq)
+            seq = reverse_complement(seq)
 
 
-            genomicSeq_start  = pos
-            genomicSeq_stop   = pos+effective_len-1
+         effective_len = len(unb_seq)
 
 
-            start = cpu()
-            #print genomicSeq_start,genomicSeq_stop
-            currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
-            stop = cpu()
-            self.get_time += stop-start
+         genomicSeq_start  = pos
+         #genomicSeq_stop   = pos+effective_len-1
+         genomicSeq_stop   = pos+effective_len
 
 
-            dna            = currentDNASeq
-            exons          = zeros((2,1))
-            exons[0,0]     = 0
-            exons[1,0]     = effective_len
-            est            = unb_seq
-            original_est   = seq
-            quality        = prb
+         start = cpu()
+         #currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
+         currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop)
 
 
-            #pdb.set_trace()
+         # just performed some tests to check LookupTable results
+         #assert currentDNASeq_ == currentDNASeq
+         #assert currentAcc_ == currentAcc_
+         #assert currentDon_ == currentDon_
 
 
-            currentVMatchAlignment = dna, exons, est, original_est, quality,\
-            currentAcc, currentDon
+         stop = cpu()
+         self.get_time += stop-start
 
 
-            alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+         dna            = currentDNASeq
+         exons          = zeros((2,1))
+         exons[0,0]     = 0
+         exons[1,0]     = effective_len
+         est            = unb_seq
+         original_est   = seq
+         quality        = map(lambda x:ord(x)-self.prb_offset,prb)
 
 
-            if alternativeAlignmentScores == []:
-                # no alignment necessary
-                maxAlternativeAlignmentScore = -inf
-                vMatchScore = 0.0 
-            else:
-                maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
-                # compute alignment for vmatch unspliced read
-                vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
+         currentVMatchAlignment = dna, exons, est, original_est, quality,\
+         currentAcc, currentDon
+
+         #pdb.set_trace()
 
 
-            start = cpu()
-            
-            #print 'vMatchScore/alternativeScore: %f %f ' % (vMatchScore,maxAlternativeAlignmentScore)
-            #print 'all candidates %s' % str(alternativeAlignmentScores)
+         #alternativeAlignmentScores = self.calcAlternativeAlignments(location)
 
 
-            new_id = id - 1000000300000
+         try:
+            alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+         except:
+            alternativeAlignmentScores = []
+
+         if alternativeAlignmentScores == []:
+             # no alignment necessary
+             maxAlternativeAlignmentScore = -inf
+             vMatchScore = 0.0 
+         else:
+             maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
+             # compute alignment for vmatch unspliced read
+             vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
+             #print 'vMatchScore/alternativeScore: %f %f %s' % (vMatchScore,maxAlternativeAlignmentScore,strand)
+             
+
+         start = cpu()
+         
+         #print 'all candidates %s' % str(alternativeAlignmentScores)
 
 
-            unspliced = False
-            # unspliced 
-            if new_id > 0: 
-               unspliced = True
+         new_id = id - 1000000300000
 
 
-            # Seems that according to our learned parameters VMatch found a good
-            # alignment of the current read
-            if maxAlternativeAlignmentScore < vMatchScore:
-               unspliced_ctr += 1
+         unspliced = False
+         # unspliced 
+         if new_id > 0: 
+            unspliced = True
 
 
-               self.result_unspliced_fh.write(original_line) 
+         # Seems that according to our learned parameters VMatch found a good
+         # alignment of the current read
+         if maxAlternativeAlignmentScore < vMatchScore:
+            unspliced_ctr += 1
 
 
-               if unspliced:
-                  self.true_neg += 1
-               else:
-                  self.false_neg += 1
+            self.result_unspliced_fh.write(original_line+'\n') 
 
 
-            # We found an alternative alignment considering splice sites that scores
-            # higher than the VMatch alignment
+            if unspliced:
+               self.true_neg += 1
             else:
-               spliced_ctr += 1
+               self.false_neg += 1
 
 
-               self.result_spliced_fh.write(original_line)
+         # We found an alternative alignment considering splice sites that scores
+         # higher than the VMatch alignment
+         else:
+            spliced_ctr += 1
 
 
-               if unspliced:
-                  self.false_pos += 1
-               else:
-                  self.true_pos += 1
+            self.result_spliced_fh.write(original_line+'\n')
 
 
-            ctr += 1
-            stop = cpu()
-            self.count_time = stop-start
+            if unspliced:
+               self.false_pos += 1
+            else:
+               self.true_pos += 1
+
+         ctr += 1
+         stop = cpu()
+         self.count_time = stop-start
 
       _stop = cpu()
       self.main_loop = _stop-_start
@@ -307,7 +372,6 @@ class PipelineHeuristic:
       #distal_acc.sort(lambda x,y: signum(x[1]-y[1])) 
       #distal_acc=distal_acc[-2:]
 
-
       proximal_don   = []
       for idx in xrange(max_intron_size+read_size/2, max_intron_size+read_size):
          if currentDon[idx] >= splice_thresh:
@@ -326,6 +390,7 @@ class PipelineHeuristic:
 
       return proximal_acc,proximal_don,distal_acc,distal_don
 
+
    def calcAlternativeAlignments(self,location):
       """
       Given an alignment proposed by Vmatch this function calculates possible
@@ -341,16 +406,25 @@ class PipelineHeuristic:
       strand   = location['strand']
       original_est = location['seq']
       quality      = location['prb']
-      cal_prb  = location['cal_prb']
+      quality        = map(lambda x:ord(x)-self.prb_offset,quality)
       
       
-      est = unbracket_est(original_est)
+      original_est = original_est.lower()
+      est = unbracket_seq(original_est)
       effective_len = len(est)
 
       genomicSeq_start  = pos - self.max_intron_size
       genomicSeq_stop   = pos + self.max_intron_size + len(est)
 
+      strand_map = {'D':'+', 'P':'-'}
+      strand = strand_map[strand]
+
+      if strand == '-':
+         est = reverse_complement(est)
+         original_est = reverse_complement(original_est)
+
       start = cpu()
-      currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
+      #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
+      currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop)
       stop = cpu()
       self.get_time += stop-start
       dna   = currentDNASeq
@@ -358,6 +432,8 @@ class PipelineHeuristic:
       proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)
        
       alternativeScores = []
+
+      #pdb.set_trace()
       
       # inlined
       h = self.h
@@ -446,6 +522,8 @@ class PipelineHeuristic:
               if acc_score>=self.splice_stop_thresh:
                   break
               
+      #pdb.set_trace()
+
       _stop = cpu()
       self.alternativeScoresTime += _stop-_start
 
@@ -552,55 +630,3 @@ class PipelineHeuristic:
       self.calcAlignmentScoreTime += stop-start
       
       return score 
-
-
-def cpu():
-   return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
-   resource.getrusage(resource.RUSAGE_SELF).ru_stime) 
-
-
-if __name__ == '__main__':
-   #run_fname = sys.argv[1]
-   #data_fname = sys.argv[2]
-   #param_filename = sys.argv[3]
-
-   dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_test/run_+_quality_+_splicesignals_+_intron_len'
-   jp = os.path.join
-
-   run_fname   = jp(dir,'run_object.pickle')
-   data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_1k'
-
-   #data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_2k'
-   #data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_100'
-
-   param_fname = jp(dir,'param_500.pickle')
-
-   result_spliced_fname    = 'splicedReads.heuristic'
-   result_unspliced_fname  = 'unsplicedReads.heuristic'
-
-   ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_spliced_fname,result_unspliced_fname)
-
-   start = cpu()
-   ph1.filter()
-   stop = cpu()
-
-   print 'total time elapsed: %f' % (stop-start)
-   print 'time spend for get seq: %f' % ph1.get_time
-   print 'time spend for calcAlignmentScoreTime: %f' %  ph1.calcAlignmentScoreTime
-   print 'time spend for alternativeScoresTime: %f' % ph1.alternativeScoresTime
-   print 'time spend for count time: %f' % ph1.count_time
-   print 'time spend for init time: %f' % ph1.init_time
-   print 'time spend for read_parsing time: %f' % ph1.read_parsing
-   print 'time spend for main_loop time: %f' % ph1.main_loop
-   print 'time spend for splice_site_time time: %f' % ph1.splice_site_time
-
-   print 'time spend for computeSpliceAlignWithQualityTime time: %f'% ph1.computeSpliceAlignWithQualityTime
-   print 'time spend for computeSpliceWeightsTime time: %f'% ph1.computeSpliceWeightsTime
-   print 'time spend for DotProdTime time: %f'% ph1.DotProdTime
-   print 'time spend forarray_stuff time: %f'% ph1.array_stuff
-   #import cProfile
-   #cProfile.run('ph1.filter()')
-
-   #import hotshot
-   #p = hotshot.Profile('profile.log')
-   #p.runcall(ph1.filter)