+ minor modifications for the prefetching the whole negative strand
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 4 Aug 2008 14:34:00 +0000 (14:34 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 4 Aug 2008 14:34:00 +0000 (14:34 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@10316 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/Lookup.py [new file with mode: 0644]
qpalma/sequence_utils.py
scripts/PipelineHeuristic.py

diff --git a/qpalma/Lookup.py b/qpalma/Lookup.py
new file mode 100644 (file)
index 0000000..7d78bcb
--- /dev/null
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import os
+
+from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size,DataAccessWrapper
+
+
+class LookupTable:
+   """
+   Speed up genomic and splice site score queries by prefetching the sequences
+   and the scores once for all chromosomes.
+   """
+
+   def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt,chromo_list):
+      accessWrapper = DataAccessWrapper(genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt)
+      self.seqInfo = SeqSpliceInfo(accessWrapper,chromo_list)
+
+      self.strands = ['+','-']
+   
+      # take care that it also works say if we get a chromo_list [1,3,5]
+      # those are mapped then to 0,1,2
+      self.chromo_list = chromo_list
+      self.chromo_map = {}
+      for idx,elem in enumerate(chromo_list):
+         self.chromo_map[elem] = idx
+
+      print 'Chromo mapping is %s' % str(self.chromo_map)
+
+      self.genomicSequences   = {}
+      self.acceptorScores     = {}
+      self.donorScores        = {}
+
+      for strandId in self.strands:
+         self.genomicSequences[strandId] = [0]*(len(self.chromo_list))
+         self.acceptorScores[strandId]   = [0]*(len(self.chromo_list))
+         self.donorScores[strandId]      = [0]*(len(self.chromo_list))
+
+      for chrId in self.chromo_list:
+         for strandId in self.strands:
+            print (chrId,strandId)
+            self.prefetch_seq_and_scores(chrId,strandId)
+
+   def get_seq_and_scores(self,chromo,strand,start,stop,dna_flat_files):
+      assert chromo in self.chromo_list
+      assert strand in self.strands
+
+      assert start <= stop
+
+      chromo_idx = self.chromo_map[chromo]
+
+      start -= 1
+      stop  -= 1
+
+      currentSequence         = self.genomicSequences[strand][chromo_idx][start:stop]
+
+      if strand == '-':
+         start += 2
+         stop  += 2
+
+      currentAcceptorScores   = self.acceptorScores[strand][chromo_idx][start:stop]
+      currentDonorScores      = self.donorScores[strand][chromo_idx][start:stop]
+
+      if strand == '-':
+         currentSequence = currentSequence[::-1]
+         currentAcceptorScores.reverse()
+         currentDonorScores.reverse()
+
+      return currentSequence, currentAcceptorScores, currentDonorScores
+
+
+   def prefetch_seq_and_scores(self,chromo,strand):
+      """
+      This function expects an interval, chromosome and strand information and
+      returns then the genomic sequence of this interval and the associated scores.
+      """
+      assert chromo in self.chromo_list
+      assert strand in self.strands
+
+      chromo_idx = self.chromo_map[chromo]
+
+      genomicSeq_start  = 1
+      genomicSeq_stop   = self.seqInfo.chromo_sizes[chromo]-1
+
+      print 'lt total_size %d' % self.seqInfo.chromo_sizes[chromo]
+
+      if strand == '-':
+         genomicSeq_stop   = self.seqInfo.chromo_sizes[chromo]+1
+
+      genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,False,False)
+      genomicSeq = genomicSeq.lower()
+
+      if strand == '-':
+         genomicSeq = genomicSeq[::-1]
+         currentAcc.reverse()
+         currentDon.reverse()
+
+      self.genomicSequences[strand][chromo_idx] = genomicSeq
+      self.acceptorScores[strand][chromo_idx]   = currentAcc
+      self.donorScores[strand][chromo_idx]      = currentDon
index 7cf5e4c..7f5e580 100644 (file)
@@ -246,7 +246,7 @@ class SeqSpliceInfo():
       #chromo_string  = 'chr%d' % chromo
       #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
       filename = self.accessWrapper.get_genomic_fragment_fn(chromo,strand)
-      print filename,genomicSeq_start,genomicSeq_stop
+      #print filename,genomicSeq_start,genomicSeq_stop
       genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
       genomicSeq = genomicSeq.strip().lower()
 
@@ -260,7 +260,7 @@ class SeqSpliceInfo():
       return genomicSeq
 
 
-   def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False):
+   def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
       """
       This function expects an interval, chromosome and strand information and
       returns then the genomic sequence of this interval and the associated scores.
@@ -330,9 +330,6 @@ class SeqSpliceInfo():
       gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
       ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
 
-      #pdb.set_trace()
-      #return genomicSeq, currentAcc, currentDon
-    
       for pos in ag_tuple_pos:
          if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
             currentAcc[pos] = 1e-6
@@ -350,26 +347,18 @@ class SeqSpliceInfo():
       acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
       don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
 
-      print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
-      print ag_tuple_pos[:10],ag_tuple_pos[-10:]
-      print acc_pos[:10],acc_pos[-10:]
-      if not ag_tuple_pos == acc_pos:
+      if perform_checks and not ag_tuple_pos == acc_pos:
          print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
          print ag_tuple_pos[:10],ag_tuple_pos[-10:]
          print acc_pos[:10],acc_pos[-10:]
          pdb.set_trace()
 
-      print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
-      print gt_tuple_pos[:10],gt_tuple_pos[-10:]
-      print don_pos[:10],don_pos[-10:]
-      if not gt_tuple_pos == don_pos:
+      if perform_checks and not gt_tuple_pos == don_pos:
          print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
          print gt_tuple_pos[:10],gt_tuple_pos[-10:]
          print don_pos[:10],don_pos[-10:]
          pdb.set_trace()
 
-      #assert ag_tuple_pos == acc_pos, pdb.set_trace()
-      #assert gt_tuple_pos == don_pos, pdb.set_trace()
       assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
 
       return genomicSeq, currentAcc, currentDon
index d4b92a8..e6e3a39 100644 (file)
@@ -247,13 +247,14 @@ class PipelineHeuristic:
 
          # forgot to do this
          if strand == '-':
-            pos = self.lt1.seqInfo.chromo_sizes[chr]-pos-self.read_size
+            #pos = self.lt1.seqInfo.chromo_sizes[chr]-pos-self.read_size
             unb_seq = reverse_complement(unb_seq)
 
          effective_len = len(unb_seq)
 
          genomicSeq_start  = pos
-         genomicSeq_stop   = pos+effective_len-1
+         #genomicSeq_stop   = pos+effective_len-1
+         genomicSeq_stop   = pos+effective_len
 
          start = cpu()
          #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])