+ extended test cases
[qpalma.git] / qpalma / sequence_utils.py
index 626ecc5..9f7ee6c 100644 (file)
@@ -286,7 +286,7 @@ class SeqSpliceInfo():
       return self.chromo_sizes[self.chromo_map[id]]
 
 
-   def getSpliceScores(self,id,strand,intervalBegin,intervalEnd,genomicSeq=''):
+   def getSpliceScores(self,id,strand,intervalBegin,intervalEnd):
       """
       Now we want to use interval_query to get the predicted splice scores trained
       on the TAIR sequence and annotation.
@@ -351,6 +351,8 @@ class SeqSpliceInfo():
          genomicSeq_stop  = s_stop
 
       assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
+      assert genomicSeq_start >= 0
+      assert genomicSeq_stop <= total_size
 
       genomicSeq = self.fetch_sequence(id,strand,genomicSeq_start,genomicSeq_stop)
 
@@ -363,27 +365,29 @@ class SeqSpliceInfo():
       # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
       lookup_offset_begin  = 10
       lookup_offset_end    = 10
+
       intervalBegin  = max(genomicSeq_start-lookup_offset_begin,0)
       if intervalBegin == 0:
          lookup_offset_begin = 0
+         intervalBegin = genomicSeq_start
          
       intervalEnd    = min(genomicSeq_stop+lookup_offset_end,total_size-1)
       if intervalEnd == total_size-1:
          lookup_offset_end = 0
+         intervalEnd = genomicSeq_stop
 
-      currentAcc, currentDon = self.getSpliceScores(id,strand,intervalBegin,intervalEnd,genomicSeq)
-
-      if strand == '-':
-         currentAcc = currentAcc[1:]
-         #currentDon = currentDon[1:]
-         #currentDon = [-inf]+currentDon[:-1]
-      else:
-         currentAcc = currentAcc[2:]
-         currentDon = currentDon[1:]
+      currentAcc, currentDon = self.getSpliceScores(id,strand,intervalBegin,intervalEnd)
 
       # remove the offset positions
-      currentAcc = currentAcc[lookup_offset_begin:]
-      currentDon = currentDon[lookup_offset_begin:]
+      if strand == '+':
+         currentAcc = currentAcc[lookup_offset_begin+2:]
+         currentDon = currentDon[lookup_offset_begin+1:]
+      elif strand == '-':
+         currentAcc = currentAcc[lookup_offset_begin:]
+         currentDon = [-inf]+currentDon
+         currentDon = currentDon[(lookup_offset_begin):]
+      else:
+         assert False
 
       currentAcc = currentAcc[:len(genomicSeq)]
       currentDon = currentDon[:len(genomicSeq)]
@@ -393,6 +397,8 @@ class SeqSpliceInfo():
       currentDon = currentDon+[-inf]*(length-len(currentDon))
       currentDon[-1] = -inf
       
+      assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
+
       gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
       ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
 
@@ -413,18 +419,146 @@ class SeqSpliceInfo():
       acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
       don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
 
+      check_window_size = 30
+
       if perform_checks and not ag_tuple_pos == acc_pos:
          print 'ACC: Chromo/strand: %d/%s' % (id,strand)
-         print ag_tuple_pos[:10],ag_tuple_pos[-10:]
-         print acc_pos[:10],acc_pos[-10:]
+         print ag_tuple_pos[:check_window_size]
+         print acc_pos[:check_window_size]
+         print 
+         print ag_tuple_pos[-check_window_size:]
+         print acc_pos[-check_window_size:]
          pdb.set_trace()
 
       if perform_checks and not gt_tuple_pos == don_pos:
          print 'DON: Chromo/strand: %d/%s' % (id,strand)
-         print gt_tuple_pos[:10],gt_tuple_pos[-10:]
-         print don_pos[:10],don_pos[-10:]
+         print gt_tuple_pos[:check_window_size]
+         print don_pos[:check_window_size]
+         print 
+         print gt_tuple_pos[-check_window_size:]
+         print don_pos[-check_window_size:]
          pdb.set_trace()
 
-      assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
 
       return genomicSeq, currentAcc, currentDon
+
+
+
+
+
+
+##################################
+##################################
+##################################
+##################################
+
+
+   def _get_seq_and_scores(self,id,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
+      """
+      This function expects an interval, chromosome and strand information and
+      returns then the genomic sequence of this interval and the associated scores.
+      """
+
+      chromo = self.chromo_map[id]
+
+      total_size = self.chromo_sizes[chromo]
+
+      if strand == '-':
+         s_start  = total_size - genomicSeq_stop
+         s_stop   = total_size - genomicSeq_start
+         genomicSeq_start = s_start
+         genomicSeq_stop  = s_stop
+
+      assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
+
+      genomicSeq = self.fetch_sequence(id,strand,genomicSeq_start,genomicSeq_stop)
+
+      if strand == '-':
+         genomicSeq = reverse_complement(genomicSeq)
+
+      if only_seq:
+         return genomicSeq
+
+      currentAcc,currentDon = self.get_scores(id,strand,genomicSeq_start,genomicSeq_stop,genomicSeq,total_size,perform_checks)
+
+      return genomicSeq, currentAcc, currentDon
+
+
+   def get_scores(self,id,strand,genomicSeq_start,genomicSeq_stop,genomicSeq,total_size,perform_checks):
+      # Because splice site scores are predicted using a window of the sequence the
+      # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
+      # do not have any score predictions
+      NO_SCORE_WINDOW_SIZE = 205
+
+      seq_size = len(genomicSeq)
+      # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
+      lookup_offset_begin  = 10
+      lookup_offset_end    = 10
+
+      intervalBegin  = max(genomicSeq_start-lookup_offset_begin,0)
+      if intervalBegin == 0:
+         lookup_offset_begin = 0
+         
+      intervalEnd    = min(genomicSeq_stop+lookup_offset_end,total_size-1)
+      if intervalEnd == total_size:
+         lookup_offset_end = 0
+
+      print 'interval is: %d %d' % (intervalBegin,intervalEnd)
+
+      # splice score indices are 1-based
+      currentAcc, currentDon = self.getSpliceScores(id,strand,intervalBegin+1,intervalEnd+1)
+
+      print 'original sizes: %d %d' % (len(currentAcc),len(currentDon))
+
+      # remove the offset positions
+      currentAcc = currentAcc[(lookup_offset_begin+1):(lookup_offset_begin+1+seq_size)]
+      currentDon = currentDon[lookup_offset_begin:lookup_offset_begin+seq_size]
+
+      print 'all sizes'
+      print len(genomicSeq)
+      print len(currentAcc)
+      print len(currentDon)
+
+      gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+      ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and e=='g' and genomicSeq[p-1]=='a' ]
+
+      for pos in ag_tuple_pos:
+         if pos+genomicSeq_start < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
+            currentAcc[pos] = 1e-6
+
+         if pos+genomicSeq_start > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
+            currentAcc[pos] = 1e-6
+
+      for pos in gt_tuple_pos:
+         if pos+genomicSeq_start < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
+            currentDon[pos] = 1e-6
+
+         if pos+genomicSeq_start > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
+            currentDon[pos] = 1e-6
+
+      acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
+      don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
+
+      check_window_size = 30
+
+      if perform_checks and not ag_tuple_pos == acc_pos:
+         print 'ACC: Chromo/strand: %d/%s' % (id,strand)
+         print ag_tuple_pos[:check_window_size]
+         print acc_pos[:check_window_size]
+         print 
+         print ag_tuple_pos[-check_window_size:]
+         print acc_pos[-check_window_size:]
+         pdb.set_trace()
+
+      if perform_checks and not gt_tuple_pos == don_pos:
+         print 'DON: Chromo/strand: %d/%s' % (id,strand)
+         print gt_tuple_pos[:check_window_size]
+         print don_pos[:check_window_size]
+         print ''
+         print gt_tuple_pos[-check_window_size:]
+         print don_pos[-check_window_size:]
+         pdb.set_trace()
+
+      assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
+
+      return currentAcc,currentDon