+ changed PipelinHeuristic to support new data access functions
[qpalma.git] / scripts / PipelineHeuristic.py
index 5f7a2f6..93d6265 100644 (file)
@@ -6,7 +6,6 @@ import sys
 import pdb
 import os
 import os.path
-import math
 import resource
 
 from qpalma.computeSpliceWeights import *
@@ -20,18 +19,18 @@ from qpalma.parsers import *
 
 from ParaParser import *
 
-from Lookup import Lookup
+from qpalma.Lookup import LookupTable
 
 from qpalma.sequence_utils import reverse_complement,unbracket_seq
 
 
-
 class BracketWrapper:
-   fields = ['id', 'chr', 'pos', 'strand', 'mismatches', 'length',\
-   'offset', 'seq', 'prb', 'cal_prb', 'chastity']
+   #fields = ['id', 'chr', 'pos', 'strand', 'mismatches', 'length',\
+   #'offset', 'seq', 'prb', 'cal_prb', 'chastity']
+   fields = ['id', 'chr', 'pos', 'strand', 'seq', 'prb']
 
    def __init__(self,filename):
-      self.parser = ParaParser("%lu%d%d%s%d%d%d%s%s%s%s",self.fields,len(self.fields),IN_VECTOR)
+      self.parser = ParaParser("%lu%d%d%s%s%s",self.fields,len(self.fields),IN_VECTOR)
       self.parser.parseFile(filename)
 
    def __len__(self):
@@ -68,23 +67,36 @@ class PipelineHeuristic:
       run = cPickle.load(open(run_fname))
       self.run = run
 
-      dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-
-
       start = cpu()
-
       # old version
       #self.all_remapped_reads = parse_map_vm_heuristic(data_fname)
       self.all_remapped_reads = BracketWrapper(data_fname)
-      
       stop = cpu()
 
       print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
 
+      #g_dir    = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
+      #acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
+      #don_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
+
+      #g_fmt = 'contig%d.dna.flat'
+      #s_fmt = 'contig_%d%s'
+
+      #self.chromo_range = range(0,1099)
+
+      g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+      g_fmt = 'chr%d.dna.flat'
+      s_fmt = 'contig_%d%s'
+
+      self.chromo_range = range(1,6)
 
       start = cpu()
-      self.lt1 = Lookup(dna_flat_files)
+      self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,self.chromo_range)
       stop = cpu()
+
       print 'prefetched sequence and splice data in %f sec' % (stop-start)
 
       self.result_spliced_fh = open('%s.spliced'%result_fname,'w+')
@@ -105,7 +117,7 @@ class PipelineHeuristic:
       self.mmatrix = mmatrix
       self.qualityPlifs = qualityPlifs
 
-      self.read_size    = 36
+      self.read_size    = 38
 
       # parameters of the heuristics to decide whether the read is spliced
       #self.splice_thresh      = 0.005
@@ -212,13 +224,13 @@ class PipelineHeuristic:
          chr      = location['chr']
          pos      = location['pos']
          strand   = location['strand']
-         mismatch = location['mismatches']
-         length   = location['length']
-         off      = location['offset']
+         #mismatch = location['mismatches']
+         #length   = location['length']
+         #off      = location['offset']
          seq      = location['seq']
          prb      = location['prb']
-         cal_prb  = location['cal_prb']
-         chastity = location['chastity']
+         #cal_prb  = location['cal_prb']
+         #chastity = location['chastity']
 
          id = int(id)
 
@@ -228,14 +240,14 @@ class PipelineHeuristic:
 
          strand = strand_map[strand]
 
-         if not chr in range(1,6):
+         if not chr in self.chromo_range:
             continue
 
          unb_seq = unbracket_seq(seq)
 
          # forgot to do this
          if strand == '-':
-            pos = self.lt1.seqInfo.chromo_sizes[chr+7]-pos-self.read_size
+            pos = self.lt1.seqInfo.chromo_sizes[chr]-pos-self.read_size
             unb_seq = reverse_complement(unb_seq)
 
          effective_len = len(unb_seq)
@@ -256,7 +268,7 @@ class PipelineHeuristic:
          exons[1,0]     = effective_len
          est            = unb_seq
          original_est   = seq
-         quality        = prb
+         quality        = map(lambda x:ord(x)-50,prb)
 
          #pdb.set_trace()
 
@@ -386,7 +398,8 @@ class PipelineHeuristic:
       strand   = location['strand']
       original_est = location['seq']
       quality      = location['prb']
-      cal_prb  = location['cal_prb']
+      quality        = map(lambda x:ord(x)-50,quality)
+      #cal_prb  = location['cal_prb']
       
       original_est = original_est.lower()
       est = unbracket_seq(original_est)