+ wrapped sequence and score functions in order to reduce function calls
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 9 Jul 2008 17:16:25 +0000 (17:16 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 9 Jul 2008 17:16:25 +0000 (17:16 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9953 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/sequence_utils.py

index f82cd62..064ff03 100644 (file)
@@ -9,6 +9,7 @@
 #
 
 import os
+import os.path
 import pdb
 import re
 import subprocess
@@ -26,7 +27,7 @@ class DataAccessionException:
    pass
 
 
-def get_flatfile_size(filename):
+def get_flat_file_size(filename):
    cmd =  'wc -c %s | cut -f1 -d \' \'' % filename
    obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    out,err = obj.communicate()
@@ -123,120 +124,136 @@ def create_bracket_seq(dna_seq,read_seq):
    return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
 
 
-def getSpliceScores(chromo,strand,intervalBegin,intervalEnd,total_size=0,genomicSeq=''):
-   """
-   Now we want to use interval_query to get the predicted splice scores trained
-   on the TAIR sequence and annotation.
-   """
 
-   size = intervalEnd-intervalBegin
-   assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
+class SeqSpliceInfo():
+   
+   def __init__(self,dna_flat_files,chromo_list):
+      assert os.path.isdir(dna_flat_files)
+      self.flat_files = dna_flat_files
+      
+      self.chromo_sizes = {}
+   
+      print "Fetching chromosome sizes..."
+      for chromo in chromo_list:
+         filename = os.path.join(self.flat_files,'chr%d.dna.flat' % chromo)
+         total_size = get_flat_file_size(filename)
+         self.chromo_sizes[chromo] = total_size
 
-   acc = size*[0.0]
-   don = size*[0.0]
+      print "done"
 
-   interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
-   pos_size = new_intp()
-   intp_assign(pos_size,1)
 
-   # fetch acceptor scores
-   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
-   acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
+   def getSpliceScores(self,chromo,strand,intervalBegin,intervalEnd,genomicSeq=''):
+      """
+      Now we want to use interval_query to get the predicted splice scores trained
+      on the TAIR sequence and annotation.
+      """
 
-   #ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g'][:100]
-   #acc_pos = [p for p,e in enumerate(acc) if e != -inf and p > 1][:100]
+      size = intervalEnd-intervalBegin
+      assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
 
-   # fetch donor scores
-   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
-   don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
+      acc = size*[0.0]
+      don = size*[0.0]
 
-   #gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
-   #don_pos = [p for p,e in enumerate(don) if e != -inf and p > 1][:100]
-   #pdb.set_trace()
+      interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
+      pos_size = new_intp()
+      intp_assign(pos_size,1)
 
-   return acc, don
+      total_size = self.chromo_sizes[chromo]
 
+      # fetch acceptor scores
+      sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
+      acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
 
-def fetch_sequence(chromo,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
-   chromo_string  = 'chr%d' % chromo
-   #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
-   #print 'loading %s...' % chromo_string 
-   genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files,one_based=False)
-   genomicSeq = genomicSeq.strip().lower()
+      #ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g'][:100]
+      #acc_pos = [p for p,e in enumerate(acc) if e != -inf and p > 1][:100]
 
-   # check the obtained dna sequence
-   assert genomicSeq != '', 'load_genomic returned empty sequence!'
-   
-   # all entries other than a c g t are set to n
-   no_base = re.compile('[^acgt]')
-   genomicSeq = no_base.sub('n',genomicSeq)
+      # fetch donor scores
+      sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
+      don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
 
-   return genomicSeq
+      #gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+      #don_pos = [p for p,e in enumerate(don) if e != -inf and p > 1][:100]
+      #pdb.set_trace()
 
+      return acc, don
 
-def get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files,only_seq=False):
-   """
-   This function expects an interval, chromosome and strand information and
-   returns then the genomic sequence of this interval and the associated scores.
-   """
-   # Because splice site scores are predicted using a window of the sequence the
-   # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
-   # do not have any score predictions
-   NO_SCORE_WINDOW_SIZE = 205
 
-   assert genomicSeq_start < genomicSeq_stop
+   def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
+      chromo_string  = 'chr%d' % chromo
+      genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
+      genomicSeq = genomicSeq.strip().lower()
 
-   genomicSeq = fetch_sequence(chromo,'+',genomicSeq_start,genomicSeq_stop,dna_flat_files)
+      # check the obtained dna sequence
+      assert genomicSeq != '', 'load_genomic returned empty sequence!'
+      
+      # all entries other than a c g t are set to n
+      no_base = re.compile('[^acgt]')
+      genomicSeq = no_base.sub('n',genomicSeq)
 
-   if only_seq:
       return genomicSeq
 
-   filename = os.path.join(dna_flat_files,'chr%d.dna.flat' % chromo)
-   total_size = get_flatfile_size(filename)
+   def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False):
+      """
+      This function expects an interval, chromosome and strand information and
+      returns then the genomic sequence of this interval and the associated scores.
+      """
+      # Because splice site scores are predicted using a window of the sequence the
+      # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
+      # do not have any score predictions
+      NO_SCORE_WINDOW_SIZE = 205
 
-   # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
-   lookup_offset  = 0
-   intervalBegin  = max(genomicSeq_start-lookup_offset,0)
-   intervalEnd    = min(genomicSeq_stop+lookup_offset,total_size-1)
+      assert genomicSeq_start < genomicSeq_stop
 
-   if chromo > 7:
-      print intervalBegin,intervalEnd,total_size
-      currentAcc, currentDon = getSpliceScores(chromo-7,'-',intervalBegin,intervalEnd,total_size,genomicSeq)
-      currentDon = [-inf]+currentDon[:-1]
-   else:
-      currentAcc, currentDon = getSpliceScores(chromo,strand,intervalBegin,intervalEnd,total_size,genomicSeq)
-      currentAcc = currentAcc[2:]
-      currentDon = currentDon[1:]
-
-   length = len(genomicSeq)
-   currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
-   currentDon = currentDon+[-inf]*(length-len(currentDon))
-   
-   #acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
-   #don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
-
-   gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
-   ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
+      genomicSeq = self.fetch_sequence(chromo,'+',genomicSeq_start,genomicSeq_stop)
 
-   #pdb.set_trace()
-   #return genomicSeq, currentAcc, currentDon
-   for pos in ag_tuple_pos:
-      if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
-         currentAcc[pos] = 1e-6
+      if only_seq:
+         return genomicSeq
 
-      if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
-         currentAcc[pos] = 1e-6
+      total_size = self.chromo_sizes[chromo]
 
-   for pos in gt_tuple_pos:
-      if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
-         currentDon[pos] = 1e-6
+      # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
+      lookup_offset  = 0
+      intervalBegin  = max(genomicSeq_start-lookup_offset,0)
+      intervalEnd    = min(genomicSeq_stop+lookup_offset,total_size-1)
 
-      if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
-         currentDon[pos] = 1e-6
-
-   assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
-   assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
-   assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
-
-   return genomicSeq, currentAcc, currentDon
+      if chromo > 7:
+         print intervalBegin,intervalEnd,total_size
+         currentAcc, currentDon = self.getSpliceScores(chromo-7,'-',intervalBegin,intervalEnd,total_size,genomicSeq)
+         currentDon = [-inf]+currentDon[:-1]
+      else:
+         currentAcc, currentDon = self.getSpliceScores(chromo,strand,intervalBegin,intervalEnd,total_size,genomicSeq)
+         currentAcc = currentAcc[2:]
+         currentDon = currentDon[1:]
+
+      length = len(genomicSeq)
+      currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
+      currentDon = currentDon+[-inf]*(length-len(currentDon))
+      
+      #acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
+      #don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
+
+      gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+      ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
+
+      #pdb.set_trace()
+      #return genomicSeq, currentAcc, currentDon
+    
+      for pos in ag_tuple_pos:
+         if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
+            currentAcc[pos] = 1e-6
+
+         if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
+            currentAcc[pos] = 1e-6
+
+      for pos in gt_tuple_pos:
+         if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
+            currentDon[pos] = 1e-6
+
+         if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
+            currentDon[pos] = 1e-6
+
+      assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
+      assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
+      assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
+
+      return genomicSeq, currentAcc, currentDon