+ added some testcases
[qpalma.git] / scripts / PipelineHeuristic.py
index 175e18d..e6e3a39 100644 (file)
@@ -6,47 +6,50 @@ import sys
 import pdb
 import os
 import os.path
-import math
 import resource
 
-from qpalma.DataProc import *
 from qpalma.computeSpliceWeights import *
 from qpalma.set_param_palma import *
 from qpalma.computeSpliceAlignWithQuality import *
-from qpalma.penalty_lookup_new import *
-from qpalma.compute_donacc import *
-from qpalma.TrainingParam import Param
-from qpalma.Plif import Plf
-
-#from qpalma.tools.splicesites import getDonAccScores
-from qpalma.Configuration import *
-
-#from compile_dataset import getSpliceScores, get_seq_and_scores
 
 from numpy.matlib import mat,zeros,ones,inf
 from numpy import inf,mean
 
 from qpalma.parsers import *
 
-from Lookup import *
+from ParaParser import *
+
+from qpalma.Lookup import LookupTable
 
+from qpalma.sequence_utils import reverse_complement,unbracket_seq
 
-def unbracket_est(est):
-   new_est = ''
-   e = 0
 
-   while True:
-      if e >= len(est):
-         break
+class BracketWrapper:
+   #fields = ['id', 'chr', 'pos', 'strand', 'mismatches', 'length',\
+   #'offset', 'seq', 'prb', 'cal_prb', 'chastity']
+   fields = ['id', 'chr', 'pos', 'strand', 'seq', 'prb']
+
+   def __init__(self,filename):
+      self.parser = ParaParser("%lu%d%d%s%s%s",self.fields,len(self.fields),IN_VECTOR)
+      self.parser.parseFile(filename)
+
+   def __len__(self):
+      return self.parser.getSize(IN_VECTOR)
+      
+   def __getitem__(self,key):
+      return self.parser.fetchEntry(key)
 
-      if est[e] == '[':
-         new_est += est[e+2]
-         e += 4
-      else:
-         new_est += est[e]
-         e += 1
+   def __iter__(self):
+      self.counter = 0
+      self.size = self.parser.getSize(IN_VECTOR)
+      return self
 
-   return "".join(new_est).lower()
+   def next(self):
+      if not self.counter < self.size:
+         raise StopIteration
+      return_val = self.parser.fetchEntry(self.counter)
+      self.counter += 1
+      return return_val
 
 
 class PipelineHeuristic:
@@ -55,7 +58,7 @@ class PipelineHeuristic:
    vmatch is spliced an should be then newly aligned using QPalma or not.
    """
 
-   def __init__(self,run_fname,data_fname,param_fname,result_spliced_fname,result_unspliced_fname):
+   def __init__(self,run_fname,data_fname,param_fname,result_fname):
       """
       We need a run object holding information about the nr. of support points
       etc.
@@ -64,22 +67,40 @@ class PipelineHeuristic:
       run = cPickle.load(open(run_fname))
       self.run = run
 
-      dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-
       start = cpu()
-      self.all_remapped_reads = parse_map_vm_heuristic(data_fname)
+      # old version
+      #self.all_remapped_reads = parse_map_vm_heuristic(data_fname)
+      self.all_remapped_reads = BracketWrapper(data_fname)
       stop = cpu()
 
       print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
 
+      #g_dir    = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
+      #acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
+      #don_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
+
+      #g_fmt = 'contig%d.dna.flat'
+      #s_fmt = 'contig_%d%s'
+
+      #self.chromo_range = range(0,1099)
+
+      g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+      g_fmt = 'chr%d.dna.flat'
+      s_fmt = 'contig_%d%s'
+
+      self.chromo_range = range(1,6)
 
       start = cpu()
-      self.lt1 = Lookup(dna_flat_files)
+      self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,self.chromo_range)
       stop = cpu()
+
       print 'prefetched sequence and splice data in %f sec' % (stop-start)
 
-      self.result_spliced_fh = open(result_spliced_fname,'w+')
-      self.result_unspliced_fh = open(result_unspliced_fname,'w+')
+      self.result_spliced_fh = open('%s.spliced'%result_fname,'w+')
+      self.result_unspliced_fh = open('%s.unspliced'%result_fname,'w+')
 
       start = cpu()
 
@@ -96,7 +117,7 @@ class PipelineHeuristic:
       self.mmatrix = mmatrix
       self.qualityPlifs = qualityPlifs
 
-      self.read_size    = 36
+      self.read_size    = 38
 
       # parameters of the heuristics to decide whether the read is spliced
       #self.splice_thresh      = 0.005
@@ -124,13 +145,14 @@ class PipelineHeuristic:
          self.result_spliced_fh.write(p_line)
          self.result_unspliced_fh.write(p_line)
    
-      self.original_reads = {}
+      #self.original_reads = {}
 
-      for line in open('/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.pipeline'):
-         line = line.strip()
-         id,seq,q1,q2,q3 = line.split()
-         id = int(id)
-         self.original_reads[id] = seq
+      # we do not have this information
+      #for line in open(reads_pipeline_fn):
+      #   line = line.strip()
+      #   id,seq,q1,q2,q3 = line.split()
+      #   id = int(id)
+      #   self.original_reads[id] = seq
 
       lengthSP    = run['numLengthSuppPoints']
       donSP       = run['numDonSuppPoints']
@@ -198,18 +220,17 @@ class PipelineHeuristic:
          if ctr % 1000 == 0:
             print ctr
 
-
          id       = location['id']
          chr      = location['chr']
          pos      = location['pos']
          strand   = location['strand']
-         mismatch = location['mismatches']
-         length   = location['length']
-         off      = location['offset']
+         #mismatch = location['mismatches']
+         #length   = location['length']
+         #off      = location['offset']
          seq      = location['seq']
          prb      = location['prb']
-         cal_prb  = location['cal_prb']
-         chastity = location['chastity']
+         #cal_prb  = location['cal_prb']
+         #chastity = location['chastity']
 
          id = int(id)
 
@@ -219,18 +240,26 @@ class PipelineHeuristic:
 
          strand = strand_map[strand]
 
-         if strand == '-':
+         if not chr in self.chromo_range:
             continue
 
-         unb_seq = unbracket_est(seq)
+         unb_seq = unbracket_seq(seq)
+
+         # forgot to do this
+         if strand == '-':
+            #pos = self.lt1.seqInfo.chromo_sizes[chr]-pos-self.read_size
+            unb_seq = reverse_complement(unb_seq)
+
          effective_len = len(unb_seq)
 
          genomicSeq_start  = pos
-         genomicSeq_stop   = pos+effective_len-1
+         #genomicSeq_stop   = pos+effective_len-1
+         genomicSeq_stop   = pos+effective_len
 
          start = cpu()
          #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
          currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+
          stop = cpu()
          self.get_time += stop-start
 
@@ -240,7 +269,7 @@ class PipelineHeuristic:
          exons[1,0]     = effective_len
          est            = unb_seq
          original_est   = seq
-         quality        = prb
+         quality        = map(lambda x:ord(x)-64,prb)
 
          #pdb.set_trace()
 
@@ -370,10 +399,11 @@ class PipelineHeuristic:
       strand   = location['strand']
       original_est = location['seq']
       quality      = location['prb']
-      cal_prb  = location['cal_prb']
+      quality        = map(lambda x:ord(x)-64,quality)
+      #cal_prb  = location['cal_prb']
       
       original_est = original_est.lower()
-      est = unbracket_est(original_est)
+      est = unbracket_seq(original_est)
       effective_len = len(est)
 
       genomicSeq_start  = pos - self.max_intron_size
@@ -596,6 +626,7 @@ def cpu():
 if __name__ == '__main__':
    if len(sys.argv) != 6:
       print 'Usage: ./%s data param run spliced.results unspliced.results' % (sys.argv[0])
+      sys.exit(1)
 
    data_fname = sys.argv[1]
    param_fname = sys.argv[2]
@@ -606,14 +637,6 @@ if __name__ == '__main__':
 
    jp = os.path.join
 
-   #dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_test/run_+_quality_+_splicesignals_+_intron_len'
-   #param_fname = jp(dir,'param_500.pickle')
-   #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_20k'
-   #data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_2k'
-   #data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_100'
-   #result_spliced_fname    = 'splicedReads.heuristic'
-   #result_unspliced_fname  = 'unsplicedReads.heuristic'
-
    ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_spliced_fname,result_unspliced_fname)
 
    start = cpu()