+ Fixed issue with g being last element of a sequence (one donor score too much)
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 10 Jul 2008 14:39:42 +0000 (14:39 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 10 Jul 2008 14:39:42 +0000 (14:39 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9991 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/sequence_utils.py

index 064ff03..7003c7c 100644 (file)
@@ -164,17 +164,10 @@ class SeqSpliceInfo():
       sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
       acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
 
-      #ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g'][:100]
-      #acc_pos = [p for p,e in enumerate(acc) if e != -inf and p > 1][:100]
-
       # fetch donor scores
       sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
       don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
 
-      #gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
-      #don_pos = [p for p,e in enumerate(don) if e != -inf and p > 1][:100]
-      #pdb.set_trace()
-
       return acc, don
 
 
@@ -204,7 +197,10 @@ class SeqSpliceInfo():
 
       assert genomicSeq_start < genomicSeq_stop
 
-      genomicSeq = self.fetch_sequence(chromo,'+',genomicSeq_start,genomicSeq_stop)
+      if strand == '-':
+         genomicSeq = self.fetch_sequence(chromo+7,'+',genomicSeq_start,genomicSeq_stop)
+      else:
+         genomicSeq = self.fetch_sequence(chromo,'+',genomicSeq_start,genomicSeq_stop)
 
       if only_seq:
          return genomicSeq
@@ -212,28 +208,40 @@ class SeqSpliceInfo():
       total_size = self.chromo_sizes[chromo]
 
       # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
-      lookup_offset  = 0
-      intervalBegin  = max(genomicSeq_start-lookup_offset,0)
-      intervalEnd    = min(genomicSeq_stop+lookup_offset,total_size-1)
+      lookup_offset_begin  = 10
+      lookup_offset_end    = 10
+      intervalBegin  = max(genomicSeq_start-lookup_offset_begin,0)
+      if intervalBegin == 0:
+         lookup_offset_begin = 0
+         
+      intervalEnd    = min(genomicSeq_stop+lookup_offset_end,total_size-1)
+      if intervalEnd == total_size-1:
+         lookup_offset_end = 0
+
+      currentAcc, currentDon = self.getSpliceScores(chromo,strand,intervalBegin,intervalEnd,genomicSeq)
+
+      original_acc = list(currentAcc)
 
-      if chromo > 7:
-         print intervalBegin,intervalEnd,total_size
-         currentAcc, currentDon = self.getSpliceScores(chromo-7,'-',intervalBegin,intervalEnd,total_size,genomicSeq)
+      if strand == '-':
          currentDon = [-inf]+currentDon[:-1]
       else:
-         currentAcc, currentDon = self.getSpliceScores(chromo,strand,intervalBegin,intervalEnd,total_size,genomicSeq)
          currentAcc = currentAcc[2:]
          currentDon = currentDon[1:]
 
-      length = len(genomicSeq)
-      currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
-      currentDon = currentDon+[-inf]*(length-len(currentDon))
-      
-      #acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
-      #don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
+      # remove the offset positions
+      currentAcc = currentAcc[lookup_offset_begin:]
+      currentDon = currentDon[lookup_offset_begin:]
+
+      currentAcc = currentAcc[:len(genomicSeq)]
+      currentDon = currentDon[:len(genomicSeq)]
 
-      gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
-      ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
+      #length = len(genomicSeq)
+      #currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
+      #currentDon = currentDon+[-inf]*(length-len(currentDon))
+      currentDon[-1] = -inf
+      
+      gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+      ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
 
       #pdb.set_trace()
       #return genomicSeq, currentAcc, currentDon
@@ -252,8 +260,23 @@ class SeqSpliceInfo():
          if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
             currentDon[pos] = 1e-6
 
-      assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
-      assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
+      acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
+      don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
+
+      if not ag_tuple_pos == acc_pos:
+         print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
+         print ag_tuple_pos[:10],ag_tuple_pos[-10:]
+         print acc_pos[:10],acc_pos[-10:]
+         #pdb.set_trace()
+
+      if not gt_tuple_pos == don_pos:
+         print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
+         print gt_tuple_pos[:10],gt_tuple_pos[-10:]
+         print don_pos[:10],don_pos[-10:]
+         #pdb.set_trace()
+
+      #assert ag_tuple_pos == acc_pos, pdb.set_trace()
+      #assert gt_tuple_pos == don_pos, pdb.set_trace()
       assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
 
       return genomicSeq, currentAcc, currentDon